1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This file is part of the LibreOffice project. 4 * 5 * This Source Code Form is subject to the terms of the Mozilla Public 6 * License, v. 2.0. If a copy of the MPL was not distributed with this 7 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 * 9 * This file incorporates work covered by the following license notice: 10 * 11 * Licensed to the Apache Software Foundation (ASF) under one or more 12 * contributor license agreements. See the NOTICE file distributed 13 * with this work for additional information regarding copyright 14 * ownership. The ASF licenses this file to you under the Apache 15 * License, Version 2.0 (the "License"); you may not use this file 16 * except in compliance with the License. You may obtain a copy of 17 * the License at http://www.apache.org/licenses/LICENSE-2.0 . 18 */ 19 20 #include <config_locales.h> 21 22 #include <sal/log.hxx> 23 24 #include <lrl_include.hxx> 25 26 #include <rtl/ustrbuf.hxx> 27 #include <i18nlangtag/languagetag.hxx> 28 #include <i18nlangtag/languagetagicu.hxx> 29 #include <collator_unicode.hxx> 30 #include <localedata.hxx> 31 #include <com/sun/star/i18n/CollatorOptions.hpp> 32 #include <cppuhelper/supportsservice.hxx> 33 34 using namespace ::com::sun::star; 35 using namespace ::com::sun::star::i18n; 36 using namespace ::com::sun::star::lang; 37 using namespace ::com::sun::star::uno; 38 39 namespace i18npool { 40 41 const char implementationName[] = "com.sun.star.i18n.Collator_Unicode"; 42 43 Collator_Unicode::Collator_Unicode() 44 { 45 collator = nullptr; 46 uca_base = nullptr; 47 #ifndef DISABLE_DYNLOADING 48 hModule = nullptr; 49 #endif 50 } 51 52 Collator_Unicode::~Collator_Unicode() 53 { 54 collator.reset(); 55 uca_base.reset(); 56 #ifndef DISABLE_DYNLOADING 57 if (hModule) osl_unloadModule(hModule); 58 #endif 59 } 60 61 #ifdef DISABLE_DYNLOADING 62 63 extern "C" { 64 65 // For DISABLE_DYNLOADING the generated functions have names that 66 // start with get_collator_data_ to avoid clashing with a few 67 // functions in the generated libindex_data that are called just 68 // get_zh_pinyin for instance. 69 70 const sal_uInt8* get_collator_data_ca_charset(); 71 const sal_uInt8* get_collator_data_cu_charset(); 72 const sal_uInt8* get_collator_data_dz_charset(); 73 const sal_uInt8* get_collator_data_hu_charset(); 74 const sal_uInt8* get_collator_data_ja_charset(); 75 const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_first(); 76 const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_last(); 77 const sal_uInt8* get_collator_data_ko_charset(); 78 const sal_uInt8* get_collator_data_ku_alphanumeric(); 79 const sal_uInt8* get_collator_data_ln_charset(); 80 const sal_uInt8* get_collator_data_my_dictionary(); 81 const sal_uInt8* get_collator_data_ne_charset(); 82 const sal_uInt8* get_collator_data_sid_charset(); 83 const sal_uInt8* get_collator_data_zh_TW_charset(); 84 const sal_uInt8* get_collator_data_zh_TW_radical(); 85 const sal_uInt8* get_collator_data_zh_TW_stroke(); 86 const sal_uInt8* get_collator_data_zh_charset(); 87 const sal_uInt8* get_collator_data_zh_pinyin(); 88 const sal_uInt8* get_collator_data_zh_radical(); 89 const sal_uInt8* get_collator_data_zh_stroke(); 90 const sal_uInt8* get_collator_data_zh_zhuyin(); 91 92 size_t get_collator_data_ca_charset_length(); 93 size_t get_collator_data_cu_charset_length(); 94 size_t get_collator_data_dz_charset_length(); 95 size_t get_collator_data_hu_charset_length(); 96 size_t get_collator_data_ja_charset_length(); 97 size_t get_collator_data_ja_phonetic_alphanumeric_first_length(); 98 size_t get_collator_data_ja_phonetic_alphanumeric_last_length(); 99 size_t get_collator_data_ko_charset_length(); 100 size_t get_collator_data_ku_alphanumeric_length(); 101 size_t get_collator_data_ln_charset_length(); 102 size_t get_collator_data_my_dictionary_length(); 103 size_t get_collator_data_ne_charset_length(); 104 size_t get_collator_data_sid_charset_length(); 105 size_t get_collator_data_zh_TW_charset_length(); 106 size_t get_collator_data_zh_TW_radical_length(); 107 size_t get_collator_data_zh_TW_stroke_length(); 108 size_t get_collator_data_zh_charset_length(); 109 size_t get_collator_data_zh_pinyin_length(); 110 size_t get_collator_data_zh_radical_length(); 111 size_t get_collator_data_zh_stroke_length(); 112 size_t get_collator_data_zh_zhuyin_length(); 113 114 } 115 116 #endif 117 118 sal_Int32 SAL_CALL 119 Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1, 120 const OUString& str2, sal_Int32 off2, sal_Int32 len2) 121 { 122 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2); 123 } 124 125 sal_Int32 SAL_CALL 126 Collator_Unicode::compareString( const OUString& str1, const OUString& str2) 127 { 128 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), str1.getLength(), 129 reinterpret_cast<const UChar *>(str2.getStr()), str2.getLength()); 130 } 131 132 #ifndef DISABLE_DYNLOADING 133 134 extern "C" { static void thisModule() {} } 135 136 #endif 137 138 sal_Int32 SAL_CALL 139 Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang::Locale& rLocale, sal_Int32 options) 140 { 141 if (!collator) { 142 UErrorCode status = U_ZERO_ERROR; 143 OUString rule = LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale, rAlgorithm); 144 if (!rule.isEmpty()) { 145 collator.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status) ); 146 if (! U_SUCCESS(status)) { 147 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status)); 148 SAL_WARN("i18npool", message); 149 throw RuntimeException(message); 150 } 151 } 152 if (!collator && OUString(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) { 153 const sal_uInt8* (*func)() = nullptr; 154 size_t (*funclen)() = nullptr; 155 156 #ifndef DISABLE_DYNLOADING 157 OUStringBuffer aBuf; 158 #ifdef SAL_DLLPREFIX 159 aBuf.append(SAL_DLLPREFIX); 160 #endif 161 aBuf.append( "collator_data" ).append( SAL_DLLEXTENSION ); 162 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT ); 163 if (hModule) { 164 aBuf.append("get_").append(rLocale.Language).append("_"); 165 if ( rLocale.Language == "zh" ) { 166 OUString func_base = aBuf.makeStringAndClear(); 167 if (OUString("TW HK MO").indexOf(rLocale.Country) >= 0) 168 { 169 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, 170 OUString(func_base + "TW_" + rAlgorithm).pData)); 171 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, 172 OUString(func_base + "TW_" + rAlgorithm + "_length").pData)); 173 } 174 if (!func) 175 { 176 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol( 177 hModule, OUString(func_base + rAlgorithm).pData)); 178 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol( 179 hModule, OUString(func_base + rAlgorithm + "_length").pData)); 180 } 181 } else { 182 if ( rLocale.Language == "ja" ) { 183 // replace algorithm name to implementation name. 184 if (rAlgorithm == "phonetic (alphanumeric first)") 185 aBuf.append("phonetic_alphanumeric_first"); 186 else if (rAlgorithm == "phonetic (alphanumeric last)") 187 aBuf.append("phonetic_alphanumeric_last"); 188 else 189 aBuf.append(rAlgorithm); 190 } else { 191 aBuf.append(rAlgorithm); 192 } 193 OUString func_base = aBuf.makeStringAndClear(); 194 OUString funclen_base = func_base + "_length"; 195 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, func_base.pData)); 196 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, funclen_base.pData)); 197 } 198 } 199 #else 200 if (false) { 201 ; 202 #if WITH_LOCALE_ALL || WITH_LOCALE_ca 203 } else if ( rLocale.Language == "ca" ) { 204 if ( rAlgorithm == "charset" ) 205 { 206 func = get_collator_data_ca_charset; 207 funclen = get_collator_data_ca_charset_length; 208 } 209 #endif 210 #if WITH_LOCALE_ALL || WITH_LOCALE_cu 211 } else if ( rLocale.Language == "cu" ) { 212 if ( rAlgorithm == "charset" ) 213 { 214 func = get_collator_data_cu_charset; 215 funclen = get_collator_data_cu_charset_length; 216 } 217 #endif 218 #if WITH_LOCALE_ALL || WITH_LOCALE_dz 219 } else if ( rLocale.Language == "dz" || rLocale.Language == "bo" ) { 220 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha 221 if ( rAlgorithm == "charset" ) 222 { 223 func = get_collator_data_dz_charset; 224 funclen = get_collator_data_dz_charset_length; 225 } 226 #endif 227 #if WITH_LOCALE_ALL || WITH_LOCALE_hu 228 } else if ( rLocale.Language == "hu" ) { 229 if ( rAlgorithm == "charset" ) 230 { 231 func = get_collator_data_hu_charset; 232 funclen = get_collator_data_hu_charset_length; 233 } 234 #endif 235 #if WITH_LOCALE_ALL || WITH_LOCALE_ja 236 } else if ( rLocale.Language == "ja" ) { 237 if ( rAlgorithm == "charset" ) 238 { 239 func = get_collator_data_ja_charset; 240 funclen = get_collator_data_ja_charset_length; 241 } 242 else if ( rAlgorithm == "phonetic (alphanumeric first)" ) 243 { 244 func = get_collator_data_ja_phonetic_alphanumeric_first; 245 funclen = get_collator_data_ja_phonetic_alphanumeric_first_length; 246 } 247 else if ( rAlgorithm == "phonetic (alphanumeric last)" ) 248 { 249 func = get_collator_data_ja_phonetic_alphanumeric_last; 250 funclen = get_collator_data_ja_phonetic_alphanumeric_last_length; 251 } 252 #endif 253 #if WITH_LOCALE_ALL || WITH_LOCALE_ko 254 #if (U_ICU_VERSION_MAJOR_NUM < 53) 255 } else if ( rLocale.Language == "ko" ) { 256 if ( rAlgorithm == "charset" ) 257 { 258 func = get_collator_data_ko_charset; 259 funclen = get_collator_data_ko_charset_length; 260 } 261 #endif 262 #endif 263 #if WITH_LOCALE_ALL || WITH_LOCALE_ku 264 } else if ( rLocale.Language == "ku" ) { 265 if ( rAlgorithm == "alphanumeric" ) 266 { 267 func = get_collator_data_ku_alphanumeric; 268 funclen = get_collator_data_ku_alphanumeric_length; 269 } 270 #endif 271 #if WITH_LOCALE_ALL || WITH_LOCALE_ln 272 } else if ( rLocale.Language == "ln" ) { 273 if ( rAlgorithm == "charset" ) 274 { 275 func = get_collator_data_ln_charset; 276 funclen = get_collator_data_ln_charset_length; 277 } 278 #endif 279 #if WITH_LOCALE_ALL || WITH_LOCALE_my 280 } else if ( rLocale.Language == "my" ) { 281 if ( rAlgorithm == "dictionary" ) 282 { 283 func = get_collator_data_my_dictionary; 284 funclen = get_collator_data_my_dictionary_length; 285 } 286 #endif 287 #if WITH_LOCALE_ALL || WITH_LOCALE_ne 288 } else if ( rLocale.Language == "ne" ) { 289 if ( rAlgorithm == "charset" ) 290 { 291 func = get_collator_data_ne_charset; 292 funclen = get_collator_data_ne_charset_length; 293 } 294 #endif 295 #if WITH_LOCALE_ALL || WITH_LOCALE_sid 296 } else if ( rLocale.Language == "sid" ) { 297 if ( rAlgorithm == "charset" ) 298 { 299 func = get_collator_data_sid_charset; 300 funclen = get_collator_data_sid_charset_length; 301 } 302 #endif 303 #if WITH_LOCALE_ALL || WITH_LOCALE_zh 304 } else if ( rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO") ) { 305 if ( rAlgorithm == "charset" ) 306 { 307 func = get_collator_data_zh_TW_charset; 308 funclen = get_collator_data_zh_TW_charset_length; 309 } 310 else if ( rAlgorithm == "radical" ) 311 { 312 func = get_collator_data_zh_TW_radical; 313 funclen = get_collator_data_zh_TW_radical_length; 314 } 315 else if ( rAlgorithm == "stroke" ) 316 { 317 func = get_collator_data_zh_TW_stroke; 318 funclen = get_collator_data_zh_TW_stroke_length; 319 } 320 } else if ( rLocale.Language == "zh" ) { 321 if ( rAlgorithm == "charset" ) 322 { 323 func = get_collator_data_zh_charset; 324 funclen = get_collator_data_zh_charset_length; 325 } 326 else if ( rAlgorithm == "pinyin" ) 327 { 328 func = get_collator_data_zh_pinyin; 329 funclen = get_collator_data_zh_pinyin_length; 330 } 331 else if ( rAlgorithm == "radical" ) 332 { 333 func = get_collator_data_zh_radical; 334 funclen = get_collator_data_zh_radical_length; 335 } 336 else if ( rAlgorithm == "stroke" ) 337 { 338 func = get_collator_data_zh_stroke; 339 funclen = get_collator_data_zh_stroke_length; 340 } 341 else if ( rAlgorithm == "zhuyin" ) 342 { 343 func = get_collator_data_zh_zhuyin; 344 funclen = get_collator_data_zh_zhuyin_length; 345 } 346 #endif 347 } 348 #endif // DISABLE_DYNLOADING 349 if (func && funclen) { 350 const sal_uInt8* ruleImage=func(); 351 size_t ruleImageSize = funclen(); 352 353 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2) 354 uca_base = new icu::RuleBasedCollator(static_cast<UChar*>(NULL), status); 355 #else 356 // Not only changed ICU 53.1 the API behavior that a negative 357 // length (ruleImageSize) now leads to failure, but also that 358 // the base RuleBasedCollator passed as uca_base here needs to 359 // have a base->tailoring == CollationRoot::getRoot() otherwise 360 // the init bails out as well, as it does for the previously 361 // used "empty" RuleBasedCollator. 362 // The default collator of the en-US locale would also fulfill 363 // the requirement. The collator of the actual locale or the 364 // NULL (default) locale does not. 365 uca_base.reset( static_cast<icu::RuleBasedCollator*>(icu::Collator::createInstance( 366 icu::Locale::getRoot(), status)) ); 367 #endif 368 if (! U_SUCCESS(status)) { 369 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status)); 370 SAL_WARN("i18npool", message); 371 throw RuntimeException(message); 372 } 373 collator.reset( new icu::RuleBasedCollator( 374 reinterpret_cast<const uint8_t*>(ruleImage), ruleImageSize, uca_base.get(), status) ); 375 if (! U_SUCCESS(status)) { 376 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status)); 377 SAL_WARN("i18npool", message); 378 throw RuntimeException(message); 379 } 380 } 381 } 382 if (!collator) { 383 /** ICU collators are loaded using a locale only. 384 ICU uses Variant as collation algorithm name (like de__PHONEBOOK 385 locale), note the empty territory (Country) designator in this special 386 case here. 387 But sometimes the mapping fails, eg for German (from Germany) phonebook, we'll have "de_DE_PHONEBOOK" 388 this one won't be remapping to collation keyword specifiers "de@collation=phonebook" 389 See http://userguide.icu-project.org/locale#TOC-Variant-code, Level 2 canonicalization, 8. 390 So let variant empty and use the fourth arg of icuLocale "keywords" 391 See LanguageTagIcu::getIcuLocale from i18nlangtag/source/languagetag/languagetagicu.cxx 392 The icu::Locale constructor changes the algorithm name to 393 uppercase itself, so we don't have to bother with that. 394 */ 395 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale), 396 "", rAlgorithm.isEmpty() ? OUString("") : "collation=" + rAlgorithm)); 397 398 // load ICU collator 399 collator.reset( static_cast<icu::RuleBasedCollator*>( icu::Collator::createInstance(icuLocale, status) ) ); 400 if (! U_SUCCESS(status)) { 401 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status)); 402 SAL_WARN("i18npool", message); 403 throw RuntimeException(message); 404 } 405 } 406 } 407 408 if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT) 409 collator->setStrength(icu::Collator::PRIMARY); 410 else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE) 411 collator->setStrength(icu::Collator::SECONDARY); 412 else 413 collator->setStrength(icu::Collator::TERTIARY); 414 415 return 0; 416 } 417 418 419 OUString SAL_CALL 420 Collator_Unicode::getImplementationName() 421 { 422 return implementationName; 423 } 424 425 sal_Bool SAL_CALL 426 Collator_Unicode::supportsService(const OUString& rServiceName) 427 { 428 return cppu::supportsService(this, rServiceName); 429 } 430 431 Sequence< OUString > SAL_CALL 432 Collator_Unicode::getSupportedServiceNames() 433 { 434 Sequence< OUString > aRet { OUString(implementationName) }; 435 return aRet; 436 } 437 438 } 439 440 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 441
