1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <config_locales.h>
21 
22 #include <sal/log.hxx>
23 
24 #include <lrl_include.hxx>
25 
26 #include <rtl/ustrbuf.hxx>
27 #include <i18nlangtag/languagetag.hxx>
28 #include <i18nlangtag/languagetagicu.hxx>
29 #include <collator_unicode.hxx>
30 #include <localedata.hxx>
31 #include <com/sun/star/i18n/CollatorOptions.hpp>
32 #include <cppuhelper/supportsservice.hxx>
33 
34 using namespace ::com::sun::star;
35 using namespace ::com::sun::star::i18n;
36 using namespace ::com::sun::star::lang;
37 using namespace ::com::sun::star::uno;
38 
39 namespace i18npool {
40 
// Name reported by getImplementationName() and getSupportedServiceNames().
constexpr char implementationName[] = "com.sun.star.i18n.Collator_Unicode";
42 
43 Collator_Unicode::Collator_Unicode()
44 {
45     collator = nullptr;
46     uca_base = nullptr;
47 #ifndef DISABLE_DYNLOADING
48     hModule = nullptr;
49 #endif
50 }
51 
52 Collator_Unicode::~Collator_Unicode()
53 {
54     collator.reset();
55     uca_base.reset();
56 #ifndef DISABLE_DYNLOADING
57     if (hModule) osl_unloadModule(hModule);
58 #endif
59 }
60 
#ifdef DISABLE_DYNLOADING

extern "C" {

// For DISABLE_DYNLOADING the generated functions have names that
// start with get_collator_data_ to avoid clashing with a few
// functions in the generated libindex_data that are called just
// get_zh_pinyin for instance.
//
// Each collation is declared as a pair: the function returning the rule
// image and the matching *_length function returning its size.

const sal_uInt8* get_collator_data_ca_charset();
size_t get_collator_data_ca_charset_length();

const sal_uInt8* get_collator_data_cu_charset();
size_t get_collator_data_cu_charset_length();

const sal_uInt8* get_collator_data_dz_charset();
size_t get_collator_data_dz_charset_length();

const sal_uInt8* get_collator_data_hu_charset();
size_t get_collator_data_hu_charset_length();

const sal_uInt8* get_collator_data_ja_charset();
size_t get_collator_data_ja_charset_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_first();
size_t get_collator_data_ja_phonetic_alphanumeric_first_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_last();
size_t get_collator_data_ja_phonetic_alphanumeric_last_length();

const sal_uInt8* get_collator_data_ko_charset();
size_t get_collator_data_ko_charset_length();

const sal_uInt8* get_collator_data_ku_alphanumeric();
size_t get_collator_data_ku_alphanumeric_length();

const sal_uInt8* get_collator_data_ln_charset();
size_t get_collator_data_ln_charset_length();

const sal_uInt8* get_collator_data_my_dictionary();
size_t get_collator_data_my_dictionary_length();

const sal_uInt8* get_collator_data_ne_charset();
size_t get_collator_data_ne_charset_length();

const sal_uInt8* get_collator_data_sid_charset();
size_t get_collator_data_sid_charset_length();

const sal_uInt8* get_collator_data_zh_TW_charset();
size_t get_collator_data_zh_TW_charset_length();

const sal_uInt8* get_collator_data_zh_TW_radical();
size_t get_collator_data_zh_TW_radical_length();

const sal_uInt8* get_collator_data_zh_TW_stroke();
size_t get_collator_data_zh_TW_stroke_length();

const sal_uInt8* get_collator_data_zh_charset();
size_t get_collator_data_zh_charset_length();

const sal_uInt8* get_collator_data_zh_pinyin();
size_t get_collator_data_zh_pinyin_length();

const sal_uInt8* get_collator_data_zh_radical();
size_t get_collator_data_zh_radical_length();

const sal_uInt8* get_collator_data_zh_stroke();
size_t get_collator_data_zh_stroke_length();

const sal_uInt8* get_collator_data_zh_zhuyin();
size_t get_collator_data_zh_zhuyin_length();

}

#endif
117 
118 sal_Int32 SAL_CALL
119 Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1,
120     const OUString& str2, sal_Int32 off2, sal_Int32 len2)
121 {
122     return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2);
123 }
124 
125 sal_Int32 SAL_CALL
126 Collator_Unicode::compareString( const OUString& str1, const OUString& str2)
127 {
128     return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), str1.getLength(),
129                              reinterpret_cast<const UChar *>(str2.getStr()), str2.getLength());
130 }
131 
#ifndef DISABLE_DYNLOADING

// Empty function whose address is handed to osl_loadModuleRelative() as the
// reference point for locating the collator data library.
extern "C" {

static void thisModule()
{
}

}

#endif
137 
138 sal_Int32 SAL_CALL
139 Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang::Locale& rLocale, sal_Int32 options)
140 {
141     if (!collator) {
142         UErrorCode status = U_ZERO_ERROR;
143         OUString rule = LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale, rAlgorithm);
144         if (!rule.isEmpty()) {
145             collator.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status) );
146             if (! U_SUCCESS(status)) {
147                 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
148                 SAL_WARN("i18npool", message);
149                 throw RuntimeException(message);
150             }
151         }
152         if (!collator && OUString(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) {
153             const sal_uInt8* (*func)() = nullptr;
154             size_t (*funclen)() = nullptr;
155 
156 #ifndef DISABLE_DYNLOADING
157             OUStringBuffer aBuf;
158 #ifdef SAL_DLLPREFIX
159             aBuf.append(SAL_DLLPREFIX);
160 #endif
161             aBuf.append( "collator_data" ).append( SAL_DLLEXTENSION );
162             hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
163             if (hModule) {
164                 aBuf.append("get_").append(rLocale.Language).append("_");
165                 if ( rLocale.Language == "zh" ) {
166                     OUString func_base = aBuf.makeStringAndClear();
167                     if (OUString("TW HK MO").indexOf(rLocale.Country) >= 0)
168                     {
169                         func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule,
170                                     OUString(func_base + "TW_" + rAlgorithm).pData));
171                         funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule,
172                                     OUString(func_base + "TW_" + rAlgorithm + "_length").pData));
173                     }
174                     if (!func)
175                     {
176                         func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(
177                                 hModule, OUString(func_base + rAlgorithm).pData));
178                         funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
179                                 hModule, OUString(func_base + rAlgorithm + "_length").pData));
180                     }
181                 } else {
182                     if ( rLocale.Language == "ja" ) {
183                         // replace algorithm name to implementation name.
184                         if (rAlgorithm == "phonetic (alphanumeric first)")
185                             aBuf.append("phonetic_alphanumeric_first");
186                         else if (rAlgorithm == "phonetic (alphanumeric last)")
187                             aBuf.append("phonetic_alphanumeric_last");
188                         else
189                             aBuf.append(rAlgorithm);
190                     } else {
191                         aBuf.append(rAlgorithm);
192                     }
193                     OUString func_base = aBuf.makeStringAndClear();
194                     OUString funclen_base = func_base + "_length";
195                     func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, func_base.pData));
196                     funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, funclen_base.pData));
197                 }
198             }
199 #else
200             if (false) {
201                 ;
202 #if WITH_LOCALE_ALL || WITH_LOCALE_ca
203             } else if ( rLocale.Language == "ca" ) {
204                 if ( rAlgorithm == "charset" )
205                 {
206                     func = get_collator_data_ca_charset;
207                     funclen = get_collator_data_ca_charset_length;
208                 }
209 #endif
210 #if WITH_LOCALE_ALL || WITH_LOCALE_cu
211             } else if ( rLocale.Language == "cu" ) {
212                 if ( rAlgorithm == "charset" )
213                 {
214                     func = get_collator_data_cu_charset;
215                     funclen = get_collator_data_cu_charset_length;
216                 }
217 #endif
218 #if WITH_LOCALE_ALL || WITH_LOCALE_dz
219             } else if ( rLocale.Language == "dz" || rLocale.Language == "bo" ) {
220                 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
221                 if ( rAlgorithm == "charset" )
222                 {
223                     func = get_collator_data_dz_charset;
224                     funclen = get_collator_data_dz_charset_length;
225                 }
226 #endif
227 #if WITH_LOCALE_ALL || WITH_LOCALE_hu
228             } else if ( rLocale.Language == "hu" ) {
229                 if ( rAlgorithm == "charset" )
230                 {
231                     func = get_collator_data_hu_charset;
232                     funclen = get_collator_data_hu_charset_length;
233                 }
234 #endif
235 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
236             } else if ( rLocale.Language == "ja" ) {
237                 if ( rAlgorithm == "charset" )
238                 {
239                     func = get_collator_data_ja_charset;
240                     funclen = get_collator_data_ja_charset_length;
241                 }
242                 else if ( rAlgorithm == "phonetic (alphanumeric first)" )
243                 {
244                     func = get_collator_data_ja_phonetic_alphanumeric_first;
245                     funclen = get_collator_data_ja_phonetic_alphanumeric_first_length;
246                 }
247                 else if ( rAlgorithm == "phonetic (alphanumeric last)" )
248                 {
249                     func = get_collator_data_ja_phonetic_alphanumeric_last;
250                     funclen = get_collator_data_ja_phonetic_alphanumeric_last_length;
251                 }
252 #endif
253 #if WITH_LOCALE_ALL || WITH_LOCALE_ko
254 #if (U_ICU_VERSION_MAJOR_NUM < 53)
255             } else if ( rLocale.Language == "ko" ) {
256                 if ( rAlgorithm == "charset" )
257                 {
258                     func = get_collator_data_ko_charset;
259                     funclen = get_collator_data_ko_charset_length;
260                 }
261 #endif
262 #endif
263 #if WITH_LOCALE_ALL || WITH_LOCALE_ku
264             } else if ( rLocale.Language == "ku" ) {
265                 if ( rAlgorithm == "alphanumeric" )
266                 {
267                     func = get_collator_data_ku_alphanumeric;
268                     funclen = get_collator_data_ku_alphanumeric_length;
269                 }
270 #endif
271 #if WITH_LOCALE_ALL || WITH_LOCALE_ln
272             } else if ( rLocale.Language == "ln" ) {
273                 if ( rAlgorithm == "charset" )
274                 {
275                     func = get_collator_data_ln_charset;
276                     funclen = get_collator_data_ln_charset_length;
277                 }
278 #endif
279 #if WITH_LOCALE_ALL || WITH_LOCALE_my
280             } else if ( rLocale.Language == "my" ) {
281                 if ( rAlgorithm == "dictionary" )
282                 {
283                     func = get_collator_data_my_dictionary;
284                     funclen = get_collator_data_my_dictionary_length;
285                 }
286 #endif
287 #if WITH_LOCALE_ALL || WITH_LOCALE_ne
288             } else if ( rLocale.Language == "ne" ) {
289                 if ( rAlgorithm == "charset" )
290                 {
291                     func = get_collator_data_ne_charset;
292                     funclen = get_collator_data_ne_charset_length;
293                 }
294 #endif
295 #if WITH_LOCALE_ALL || WITH_LOCALE_sid
296             } else if ( rLocale.Language == "sid" ) {
297                 if ( rAlgorithm == "charset" )
298                 {
299                     func = get_collator_data_sid_charset;
300                     funclen = get_collator_data_sid_charset_length;
301                 }
302 #endif
303 #if WITH_LOCALE_ALL || WITH_LOCALE_zh
304             } else if ( rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO") ) {
305                 if ( rAlgorithm == "charset" )
306                 {
307                     func = get_collator_data_zh_TW_charset;
308                     funclen = get_collator_data_zh_TW_charset_length;
309                 }
310                 else if ( rAlgorithm == "radical" )
311                 {
312                     func = get_collator_data_zh_TW_radical;
313                     funclen = get_collator_data_zh_TW_radical_length;
314                 }
315                 else if ( rAlgorithm == "stroke" )
316                 {
317                     func = get_collator_data_zh_TW_stroke;
318                     funclen = get_collator_data_zh_TW_stroke_length;
319                 }
320             } else if ( rLocale.Language == "zh" ) {
321                 if ( rAlgorithm == "charset" )
322                 {
323                     func = get_collator_data_zh_charset;
324                     funclen = get_collator_data_zh_charset_length;
325                 }
326                 else if ( rAlgorithm == "pinyin" )
327                 {
328                     func = get_collator_data_zh_pinyin;
329                     funclen = get_collator_data_zh_pinyin_length;
330                 }
331                 else if ( rAlgorithm == "radical" )
332                 {
333                     func = get_collator_data_zh_radical;
334                     funclen = get_collator_data_zh_radical_length;
335                 }
336                 else if ( rAlgorithm == "stroke" )
337                 {
338                     func = get_collator_data_zh_stroke;
339                     funclen = get_collator_data_zh_stroke_length;
340                 }
341                 else if ( rAlgorithm == "zhuyin" )
342                 {
343                     func = get_collator_data_zh_zhuyin;
344                     funclen = get_collator_data_zh_zhuyin_length;
345                 }
346 #endif
347             }
348 #endif // DISABLE_DYNLOADING
349             if (func && funclen) {
350                 const sal_uInt8* ruleImage=func();
351                 size_t ruleImageSize = funclen();
352 
353 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
354                 uca_base = new icu::RuleBasedCollator(static_cast<UChar*>(NULL), status);
355 #else
356                 // Not only changed ICU 53.1 the API behavior that a negative
357                 // length (ruleImageSize) now leads to failure, but also that
358                 // the base RuleBasedCollator passed as uca_base here needs to
359                 // have a base->tailoring == CollationRoot::getRoot() otherwise
360                 // the init bails out as well, as it does for the previously
361                 // used "empty" RuleBasedCollator.
362                 // The default collator of the en-US locale would also fulfill
363                 // the requirement. The collator of the actual locale or the
364                 // NULL (default) locale does not.
365                 uca_base.reset( static_cast<icu::RuleBasedCollator*>(icu::Collator::createInstance(
366                             icu::Locale::getRoot(), status)) );
367 #endif
368                 if (! U_SUCCESS(status)) {
369                     OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
370                     SAL_WARN("i18npool", message);
371                     throw RuntimeException(message);
372                 }
373                 collator.reset( new icu::RuleBasedCollator(
374                         reinterpret_cast<const uint8_t*>(ruleImage), ruleImageSize, uca_base.get(), status) );
375                 if (! U_SUCCESS(status)) {
376                     OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
377                     SAL_WARN("i18npool", message);
378                     throw RuntimeException(message);
379                 }
380             }
381         }
382         if (!collator) {
383             /** ICU collators are loaded using a locale only.
384                 ICU uses Variant as collation algorithm name (like de__PHONEBOOK
385                 locale), note the empty territory (Country) designator in this special
386                 case here.
387                 But sometimes the mapping fails, eg for German (from Germany) phonebook, we'll have "de_DE_PHONEBOOK"
388                 this one won't be remapping to collation keyword specifiers "de@collation=phonebook"
389                 See http://userguide.icu-project.org/locale#TOC-Variant-code, Level 2 canonicalization, 8.
390                 So let variant empty and use the fourth arg of icuLocale "keywords"
391                 See LanguageTagIcu::getIcuLocale from i18nlangtag/source/languagetag/languagetagicu.cxx
392                 The icu::Locale constructor changes the algorithm name to
393                 uppercase itself, so we don't have to bother with that.
394             */
395             icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale),
396                         "", rAlgorithm.isEmpty() ? OUString("") : "collation=" + rAlgorithm));
397 
398             // load ICU collator
399             collator.reset( static_cast<icu::RuleBasedCollator*>( icu::Collator::createInstance(icuLocale, status) ) );
400             if (! U_SUCCESS(status)) {
401                 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
402                 SAL_WARN("i18npool", message);
403                 throw RuntimeException(message);
404             }
405         }
406     }
407 
408     if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT)
409         collator->setStrength(icu::Collator::PRIMARY);
410     else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE)
411         collator->setStrength(icu::Collator::SECONDARY);
412     else
413         collator->setStrength(icu::Collator::TERTIARY);
414 
415     return 0;
416 }
417 
418 
419 OUString SAL_CALL
420 Collator_Unicode::getImplementationName()
421 {
422     return implementationName;
423 }
424 
425 sal_Bool SAL_CALL
426 Collator_Unicode::supportsService(const OUString& rServiceName)
427 {
428     return cppu::supportsService(this, rServiceName);
429 }
430 
431 Sequence< OUString > SAL_CALL
432 Collator_Unicode::getSupportedServiceNames()
433 {
434     Sequence< OUString > aRet { OUString(implementationName) };
435     return aRet;
436 }
437 
438 }
439 
440 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
441