xref: /core/i18nutil/source/utility/unicode.cxx (revision c3c620c8)
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include "unicode_data.h"
28 #include <rtl/character.hxx>
29 #include <memory>
30 
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
35 
36 using namespace ::com::sun::star::i18n;
37 
38 template<class L, typename T>
39 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
40 
41     sal_Int16 i = 0;
42     css::i18n::UnicodeScript type = typeList[0].to;
43     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
44         type = typeList[++i].to;
45     }
46 
47     return (type < UnicodeScript_kScriptCount &&
48             ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
49             typeList[i].value : unknownType;
50 }
51 
52 sal_Int16
53 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
54     return getScriptType(ch, typeList, unknownType);
55 }
56 
57 sal_Unicode
58 unicode::getUnicodeScriptStart( UnicodeScript type) {
59     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
60 }
61 
62 sal_Unicode
63 unicode::getUnicodeScriptEnd( UnicodeScript type) {
64     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
65 }
66 
67 sal_Int16
68 unicode::getUnicodeType( const sal_Unicode ch ) {
69     static sal_Unicode c = 0x00;
70     static sal_Int16 r = 0x00;
71 
72     if (ch == c) return r;
73     else c = ch;
74 
75     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
76     return r = static_cast<sal_Int16>((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
77         UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
78 }
79 
80 sal_uInt8
81 unicode::getUnicodeDirection( const sal_Unicode ch ) {
82     static sal_Unicode c = 0x00;
83     static sal_uInt8 r = 0x00;
84 
85     if (ch == c) return r;
86     else c = ch;
87 
88     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
89     return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
90         UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
91 
92 }
93 
// Single-bit mask with bit number `name` set. The argument is parenthesised
// so that an expression argument (e.g. bit(a | b)) shifts by the whole
// expression rather than by its first operand.
#define bit(name)   (1U << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// The multi-bit masks below are wrapped in parentheses so they behave as a
// single value wherever they are expanded; without them `x & ALPHAMASK`
// would parse as `(x & UPPERMASK) | LOWERMASK | ...`.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Defines `bool func(const sal_Unicode ch)` returning true when the bit for
// getUnicodeType(ch) is contained in `mask`.
#define IsType(func, mask)  \
bool func( const sal_Unicode ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
119 
// Instantiate the category predicates through the IsType macro. Each one
// returns true when the bit for getUnicodeType(ch) is part of the given mask:
//   isControl - CONTROL, FORMAT, LINE_SEPARATOR or PARAGRAPH_SEPARATOR
//   isAlpha   - upper/lower/title-case, MODIFIER_LETTER or OTHER_LETTER
//   isSpace   - SPACE_SEPARATOR, LINE_SEPARATOR or PARAGRAPH_SEPARATOR
IsType(unicode::isControl, CONTROLMASK)
IsType(unicode::isAlpha, ALPHAMASK)
IsType(unicode::isSpace, SPACEMASK)
123 
124 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
125             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
126 
127 bool unicode::isWhiteSpace( const sal_Unicode ch) {
128     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
129 }
130 
131 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
132 {
133     //See unicode/uscript.h
134     static const sal_Int16 scriptTypes[] =
135     {
136         ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
137         ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
138         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
139     // 15
140         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
141         ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
142         ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
143     // 30
144         ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
145         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
146         ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
147     // 45
148         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
149         ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
150         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
151     // 60
152         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
153         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
154         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
155     // 75
156         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
157         ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
158         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
159     // 90
160         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
161         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
162         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
163     // 105
164         ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
165         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
166         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
167     // 120
168         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
169         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
170         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
171     // 135
172         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
173         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
174         ScriptType::COMPLEX,
175         ScriptType::WEAK
176     };
177 
178     sal_Int16 nRet;
179     if (eScript < USCRIPT_COMMON)
180         nRet = ScriptType::WEAK;
181     else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
182         nRet = ScriptType::COMPLEX;         // anything new is going to be pretty wild
183     else
184         nRet = scriptTypes[eScript];
185     return nRet;
186 }
187 
188 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
189 {
190     OString sRet;
191     switch (eScript)
192     {
193         case USCRIPT_CODE_LIMIT:
194         case USCRIPT_INVALID_CODE:
195             sRet = "zxx";
196             break;
197         case USCRIPT_COMMON:
198         case USCRIPT_INHERITED:
199             sRet = "und";
200             break;
201         case USCRIPT_MATHEMATICAL_NOTATION:
202         case USCRIPT_SYMBOLS:
203             sRet = "zxx";
204             break;
205         case USCRIPT_UNWRITTEN_LANGUAGES:
206         case USCRIPT_UNKNOWN:
207             sRet = "und";
208             break;
209         case USCRIPT_ARABIC:
210             sRet = "ar";
211             break;
212         case USCRIPT_ARMENIAN:
213             sRet = "hy";
214             break;
215         case USCRIPT_BENGALI:
216             sRet = "bn";
217             break;
218         case USCRIPT_BOPOMOFO:
219             sRet = "zh";
220             break;
221         case USCRIPT_CHEROKEE:
222             sRet = "chr";
223             break;
224         case USCRIPT_COPTIC:
225             sRet = "cop";
226             break;
227         case USCRIPT_CYRILLIC:
228             sRet = "ru";
229             break;
230         case USCRIPT_DESERET:
231             sRet = "en";
232             break;
233         case USCRIPT_DEVANAGARI:
234             sRet = "hi";
235             break;
236         case USCRIPT_ETHIOPIC:
237             sRet = "am";
238             break;
239         case USCRIPT_GEORGIAN:
240             sRet = "ka";
241             break;
242         case USCRIPT_GOTHIC:
243             sRet = "got";
244             break;
245         case USCRIPT_GREEK:
246             sRet = "el";
247             break;
248         case USCRIPT_GUJARATI:
249             sRet = "gu";
250             break;
251         case USCRIPT_GURMUKHI:
252             sRet = "pa";
253             break;
254         case USCRIPT_HAN:
255             sRet = "zh";
256             break;
257         case USCRIPT_HANGUL:
258             sRet = "ko";
259             break;
260         case USCRIPT_HEBREW:
261             sRet = "hr";
262             break;
263         case USCRIPT_HIRAGANA:
264             sRet = "ja";
265             break;
266         case USCRIPT_KANNADA:
267             sRet = "kn";
268             break;
269         case USCRIPT_KATAKANA:
270             sRet = "ja";
271             break;
272         case USCRIPT_KHMER:
273             sRet = "km";
274             break;
275         case USCRIPT_LAO:
276             sRet = "lo";
277             break;
278         case USCRIPT_LATIN:
279             sRet = "en";
280             break;
281         case USCRIPT_MALAYALAM:
282             sRet = "ml";
283             break;
284         case USCRIPT_MONGOLIAN:
285             sRet = "mn";
286             break;
287         case USCRIPT_MYANMAR:
288             sRet = "my";
289             break;
290         case USCRIPT_OGHAM:
291             sRet = "pgl";
292             break;
293         case USCRIPT_OLD_ITALIC:
294             sRet = "osc";
295             break;
296         case USCRIPT_ORIYA:
297             sRet = "or";
298             break;
299         case USCRIPT_RUNIC:
300             sRet = "ang";
301             break;
302         case USCRIPT_SINHALA:
303             sRet = "si";
304             break;
305         case USCRIPT_SYRIAC:
306             sRet = "syr";
307             break;
308         case USCRIPT_TAMIL:
309             sRet = "ta";
310             break;
311         case USCRIPT_TELUGU:
312             sRet = "te";
313             break;
314         case USCRIPT_THAANA:
315             sRet = "dv";
316             break;
317         case USCRIPT_THAI:
318             sRet = "th";
319             break;
320         case USCRIPT_TIBETAN:
321             sRet = "bo";
322             break;
323         case USCRIPT_CANADIAN_ABORIGINAL:
324             sRet = "iu";
325             break;
326         case USCRIPT_YI:
327             sRet = "ii";
328             break;
329         case USCRIPT_TAGALOG:
330             sRet = "tl";
331             break;
332         case USCRIPT_HANUNOO:
333             sRet = "hnn";
334             break;
335         case USCRIPT_BUHID:
336             sRet = "bku";
337             break;
338         case USCRIPT_TAGBANWA:
339             sRet = "tbw";
340             break;
341         case USCRIPT_BRAILLE:
342             sRet = "en";
343             break;
344         case USCRIPT_CYPRIOT:
345             sRet = "ecy";
346             break;
347         case USCRIPT_LIMBU:
348             sRet = "lif";
349             break;
350         case USCRIPT_LINEAR_B:
351             sRet = "gmy";
352             break;
353         case USCRIPT_OSMANYA:
354             sRet = "so";
355             break;
356         case USCRIPT_SHAVIAN:
357             sRet = "en";
358             break;
359         case USCRIPT_TAI_LE:
360             sRet = "tdd";
361             break;
362         case USCRIPT_UGARITIC:
363             sRet = "uga";
364             break;
365         case USCRIPT_KATAKANA_OR_HIRAGANA:
366             sRet = "ja";
367             break;
368         case USCRIPT_BUGINESE:
369             sRet = "bug";
370             break;
371         case USCRIPT_GLAGOLITIC:
372             sRet = "ch";
373             break;
374         case USCRIPT_KHAROSHTHI:
375             sRet = "pra";
376             break;
377         case USCRIPT_SYLOTI_NAGRI:
378             sRet = "syl";
379             break;
380         case USCRIPT_NEW_TAI_LUE:
381             sRet = "khb";
382             break;
383         case USCRIPT_TIFINAGH:
384             sRet = "tmh";
385             break;
386         case USCRIPT_OLD_PERSIAN:
387             sRet = "peo";
388             break;
389         case USCRIPT_BALINESE:
390             sRet = "ban";
391             break;
392         case USCRIPT_BATAK:
393             sRet = "btk";
394             break;
395         case USCRIPT_BLISSYMBOLS:
396             sRet = "en";
397             break;
398         case USCRIPT_BRAHMI:
399             sRet = "pra";
400             break;
401         case USCRIPT_CHAM:
402             sRet = "cja";
403             break;
404         case USCRIPT_CIRTH:
405             sRet = "sjn";
406             break;
407         case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
408             sRet = "cu";
409             break;
410         case USCRIPT_DEMOTIC_EGYPTIAN:
411         case USCRIPT_HIERATIC_EGYPTIAN:
412         case USCRIPT_EGYPTIAN_HIEROGLYPHS:
413             sRet = "egy";
414             break;
415         case USCRIPT_KHUTSURI:
416             sRet = "ka";
417             break;
418         case USCRIPT_SIMPLIFIED_HAN:
419             sRet = "zh";
420             break;
421         case USCRIPT_TRADITIONAL_HAN:
422             sRet = "zh";
423             break;
424         case USCRIPT_PAHAWH_HMONG:
425             sRet = "blu";
426             break;
427         case USCRIPT_OLD_HUNGARIAN:
428             sRet = "ohu";
429             break;
430         case USCRIPT_HARAPPAN_INDUS:
431             sRet = "xiv";
432             break;
433         case USCRIPT_JAVANESE:
434             sRet = "kaw";
435             break;
436         case USCRIPT_KAYAH_LI:
437             sRet = "eky";
438             break;
439         case USCRIPT_LATIN_FRAKTUR:
440             sRet = "de";
441             break;
442         case USCRIPT_LATIN_GAELIC:
443             sRet = "ga";
444             break;
445         case USCRIPT_LEPCHA:
446             sRet = "lep";
447             break;
448         case USCRIPT_LINEAR_A:
449             sRet = "ecr";
450             break;
451         case USCRIPT_MAYAN_HIEROGLYPHS:
452             sRet = "myn";
453             break;
454         case USCRIPT_MEROITIC:
455             sRet = "xmr";
456             break;
457         case USCRIPT_NKO:
458             sRet = "nqo";
459             break;
460         case USCRIPT_ORKHON:
461             sRet = "otk";
462             break;
463         case USCRIPT_OLD_PERMIC:
464             sRet = "kv";
465             break;
466         case USCRIPT_PHAGS_PA:
467             sRet = "xng";
468             break;
469         case USCRIPT_PHOENICIAN:
470             sRet = "phn";
471             break;
472         case USCRIPT_PHONETIC_POLLARD:
473             sRet = "hmd";
474             break;
475         case USCRIPT_RONGORONGO:
476             sRet = "rap";
477             break;
478         case USCRIPT_SARATI:
479             sRet = "qya";
480             break;
481         case USCRIPT_ESTRANGELO_SYRIAC:
482             sRet = "syr";
483             break;
484         case USCRIPT_WESTERN_SYRIAC:
485             sRet = "tru";
486             break;
487         case USCRIPT_EASTERN_SYRIAC:
488             sRet = "aii";
489             break;
490         case USCRIPT_TENGWAR:
491             sRet = "sjn";
492             break;
493         case USCRIPT_VAI:
494             sRet = "vai";
495             break;
496         case USCRIPT_VISIBLE_SPEECH:
497             sRet = "en";
498             break;
499         case USCRIPT_CUNEIFORM:
500             sRet = "akk";
501             break;
502         case USCRIPT_CARIAN:
503             sRet = "xcr";
504             break;
505         case USCRIPT_JAPANESE:
506             sRet = "ja";
507             break;
508         case USCRIPT_LANNA:
509             sRet = "nod";
510             break;
511         case USCRIPT_LYCIAN:
512             sRet = "xlc";
513             break;
514         case USCRIPT_LYDIAN:
515             sRet = "xld";
516             break;
517         case USCRIPT_OL_CHIKI:
518             sRet = "sat";
519             break;
520         case USCRIPT_REJANG:
521             sRet = "rej";
522             break;
523         case USCRIPT_SAURASHTRA:
524             sRet = "saz";
525             break;
526         case USCRIPT_SIGN_WRITING:
527             sRet = "en";
528             break;
529         case USCRIPT_SUNDANESE:
530             sRet = "su";
531             break;
532         case USCRIPT_MOON:
533             sRet = "en";
534             break;
535         case USCRIPT_MEITEI_MAYEK:
536             sRet = "mni";
537             break;
538         case USCRIPT_IMPERIAL_ARAMAIC:
539             sRet = "arc";
540             break;
541         case USCRIPT_AVESTAN:
542             sRet = "ae";
543             break;
544         case USCRIPT_CHAKMA:
545             sRet = "ccp";
546             break;
547         case USCRIPT_KOREAN:
548             sRet = "ko";
549             break;
550         case USCRIPT_KAITHI:
551             sRet = "awa";
552             break;
553         case USCRIPT_MANICHAEAN:
554             sRet = "xmn";
555             break;
556         case USCRIPT_INSCRIPTIONAL_PAHLAVI:
557         case USCRIPT_PSALTER_PAHLAVI:
558         case USCRIPT_BOOK_PAHLAVI:
559         case USCRIPT_INSCRIPTIONAL_PARTHIAN:
560             sRet = "xpr";
561             break;
562         case USCRIPT_SAMARITAN:
563             sRet = "heb";
564             break;
565         case USCRIPT_TAI_VIET:
566             sRet = "blt";
567             break;
568         case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
569             sRet = "mic";
570             break;
571 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
572         case USCRIPT_NABATAEAN: //no language with an assigned code yet
573             sRet = "mis";
574             break;
575         case USCRIPT_PALMYRENE: //no language with an assigned code yet
576             sRet = "mis";
577             break;
578         case USCRIPT_BAMUM:
579             sRet = "bax";
580             break;
581         case USCRIPT_LISU:
582             sRet = "lis";
583             break;
584         case USCRIPT_NAKHI_GEBA:
585             sRet = "nxq";
586             break;
587         case USCRIPT_OLD_SOUTH_ARABIAN:
588             sRet = "xsa";
589             break;
590         case USCRIPT_BASSA_VAH:
591             sRet = "bsq";
592             break;
593         case USCRIPT_DUPLOYAN_SHORTAND:
594             sRet = "fr";
595             break;
596         case USCRIPT_ELBASAN:
597             sRet = "sq";
598             break;
599         case USCRIPT_GRANTHA:
600             sRet = "ta";
601             break;
602         case USCRIPT_KPELLE:
603             sRet = "kpe";
604             break;
605         case USCRIPT_LOMA:
606             sRet = "lom";
607             break;
608         case USCRIPT_MENDE:
609             sRet = "men";
610             break;
611         case USCRIPT_MEROITIC_CURSIVE:
612             sRet = "xmr";
613             break;
614         case USCRIPT_OLD_NORTH_ARABIAN:
615             sRet = "xna";
616             break;
617         case USCRIPT_SINDHI:
618             sRet = "sd";
619             break;
620         case USCRIPT_WARANG_CITI:
621             sRet = "hoc";
622             break;
623 #endif
624 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
625         case USCRIPT_AFAKA:
626             sRet = "djk";
627             break;
628         case USCRIPT_JURCHEN:
629             sRet = "juc";
630             break;
631         case USCRIPT_MRO:
632             sRet = "cmr";
633             break;
634         case USCRIPT_NUSHU: //no language with an assigned code yet
635             sRet = "mis";
636             break;
637         case USCRIPT_SHARADA:
638             sRet = "sa";
639             break;
640         case USCRIPT_SORA_SOMPENG:
641             sRet = "srb";
642             break;
643         case USCRIPT_TAKRI:
644             sRet = "doi";
645             break;
646         case USCRIPT_TANGUT:
647             sRet = "txg";
648             break;
649         case USCRIPT_WOLEAI:
650             sRet = "woe";
651             break;
652 #endif
653 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
654         case USCRIPT_ANATOLIAN_HIEROGLYPHS:
655             sRet = "hlu";
656             break;
657         case USCRIPT_KHOJKI:
658             sRet = "gu";
659             break;
660         case USCRIPT_TIRHUTA:
661             sRet = "mai";
662             break;
663 #endif
664 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
665         case USCRIPT_CAUCASIAN_ALBANIAN:
666             sRet = "xag";
667             break;
668         case USCRIPT_MAHAJANI:
669             sRet = "mwr";
670             break;
671 #endif
672 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
673         case USCRIPT_AHOM:
674             sRet = "aho";
675             break;
676         case USCRIPT_HATRAN:
677             sRet = "qly-Hatr";
678             break;
679         case USCRIPT_MODI:
680             sRet = "mr-Modi";
681             break;
682         case USCRIPT_MULTANI:
683             sRet = "skr-Mutl";
684             break;
685         case USCRIPT_PAU_CIN_HAU:
686             sRet = "ctd-Pauc";
687             break;
688         case USCRIPT_SIDDHAM:
689             sRet = "sa-Sidd";
690             break;
691 #endif
692 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
693         case USCRIPT_ADLAM:
694             sRet = "mis";   // Adlm - Adlam for Fulani, no language code
695             break;
696         case USCRIPT_BHAIKSUKI:
697             sRet = "mis";   // Bhks - Bhaiksuki for some Buddhist texts, no language code
698             break;
699         case USCRIPT_MARCHEN:
700             sRet = "bo-Marc";
701             break;
702         case USCRIPT_NEWA:
703             sRet = "new-Newa";
704             break;
705         case USCRIPT_OSAGE:
706             sRet = "osa-Osge";
707             break;
708         case USCRIPT_HAN_WITH_BOPOMOFO:
709             sRet = "mis";   // Hanb - Han with Bopomofo, zh-Hanb ?
710             break;
711         case USCRIPT_JAMO:
712             sRet = "mis";   // Jamo - Jamo subset of Hangul, ko-Jamo ?
713             break;
714         case USCRIPT_SYMBOLS_EMOJI:
715             sRet = "mis";   // Zsye - Emoji variant
716             break;
717 #endif
718 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
719         case USCRIPT_MASARAM_GONDI:
720             sRet = "gon-Gonm";  // macro language code, could be wsg,esg,gno
721             break;
722         case USCRIPT_SOYOMBO:
723             sRet = "mn-Soyo";   // abugida to write Mongolian, also Tibetan and Sanskrit
724             break;
725         case USCRIPT_ZANABAZAR_SQUARE:
726             sRet = "mn-Zanb";   // abugida to write Mongolian
727             break;
728 #endif
729 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
730         case USCRIPT_DOGRA:
731             sRet = "dgo";       // Dogri proper
732             break;
733         case USCRIPT_GUNJALA_GONDI:
734             sRet = "wsg";       // Adilabad Gondi
735             break;
736         case USCRIPT_MAKASAR:
737             sRet = "mak";
738             break;
739         case USCRIPT_MEDEFAIDRIN:
740             sRet = "mis-Medf";  // Uncoded with script
741             break;
742         case USCRIPT_HANIFI_ROHINGYA:
743             sRet = "rhg";
744             break;
745         case USCRIPT_SOGDIAN:
746             sRet = "sog";
747             break;
748         case USCRIPT_OLD_SOGDIAN:
749             sRet = "sog";
750             break;
751 #endif
752     }
753     return sRet;
754 }
755 
756 //Format a number as a percentage according to the rules of the given
757 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
758 OUString unicode::formatPercent(double dNumber,
759     const LanguageTag &rLangTag)
760 {
761     // get a currency formatter for this locale ID
762     UErrorCode errorCode=U_ZERO_ERROR;
763 
764     LanguageTag aLangTag(rLangTag);
765 
766     // As of CLDR Version 24 these languages were not listed as using spacing
767     // between number and % but are reported as such by our l10n groups
768     // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
769     // so format using French which has the desired rules
770     if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
771         aLangTag.reset("fr-FR");
772 
773     icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
774 
775     std::unique_ptr<icu::NumberFormat> xF(
776         icu::NumberFormat::createPercentInstance(aLocale, errorCode));
777     if(U_FAILURE(errorCode))
778     {
779         SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
780         return OUString::number(dNumber) + "%";
781     }
782 
783     icu::UnicodeString output;
784     xF->format(dNumber/100, output);
785     OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
786         output.length());
787     if (rLangTag.getLanguage() == "de")
788     {
789         //narrow no-break space instead of (normal) no-break space
790         return aRet.replace(0x00A0, 0x202F);
791     }
792     return aRet;
793 }
794 
// Feed one more UTF-16 code unit into the toggle state machine.
//
// The unit is classified with unicode::getUnicodeType() and, depending on
// that class and the state collected so far, is added to one of the buffers
// maUtf16 (surrogates), maCombining (combining marks) or maInput (hex digits,
// "U+" prefixes, or a single ordinary character).
//
// Returns true when the caller may supply another code unit, false once
// collection is finished; mbAllowMoreChars remembers that decision so every
// later call returns false immediately.
//
// NOTE(review): almost every buffer write is an insert at position 0, which
// suggests the caller feeds the text backwards (last character first) -
// confirm against the caller.
bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
{
    //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
    if( maInput.getLength() > 255 )
        mbAllowMoreChars = false;

    if( !mbAllowMoreChars )
        return false;

    // Once a "U+" has been collected, only further U+ notation may be added.
    bool bPreventNonHex = false;
    if( maInput.indexOf("U+") != -1 )
        bPreventNonHex = true;

    switch ( unicode::getUnicodeType(uChar) )
    {
        case css::i18n::UnicodeType::SURROGATE:
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            // First unit seen is a low surrogate: keep it and ask for the
            // next unit, which is expected to be the matching high surrogate.
            if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty()  )
            {
                maUtf16.append(uChar);
                return true;
            }
            // High surrogate goes in front of whatever maUtf16 already holds.
            if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
                maUtf16.insert(0, uChar );
            //end of hex strings, or unexpected order of high/low, so don't accept more
            if( !maUtf16.isEmpty() )
                maInput.append(maUtf16);
            if( !maCombining.isEmpty() )
                maInput.append(maCombining);
            mbAllowMoreChars = false;
            break;

        case css::i18n::UnicodeType::NON_SPACING_MARK:
        case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
            if( bPreventNonHex )
            {
                mbAllowMoreChars = false;
                return false;
            }

            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
            if( !maUtf16.isEmpty() )
            {
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }
            // Collect the mark; mbAllowMoreChars stays true so collection continues.
            maCombining.insert(0, uChar);
            break;

        default:
            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
            if( !maUtf16.isEmpty() )
            {
                maInput = maUtf16;
                if( !maCombining.isEmpty() )
                    maInput.append(maCombining);
                mbAllowMoreChars = false;
                return false;
            }

            // Combining marks were collected: this unit is placed in front of
            // them and the result becomes the whole input.
            if( !maCombining.isEmpty() )
            {
                maCombining.insert(0, uChar);
                maInput = maCombining;
                mbAllowMoreChars = false;
                return false;
            }

            // 0 - 1f are control characters.  Do not process those.
            if( uChar < 0x20 )
            {
                mbAllowMoreChars = false;
                return false;
            }

            switch( uChar )
            {
                case 'u':
                case 'U':
                    // U+ notation found.  Continue looking for another one.
                    if( mbRequiresU )
                    {
                        mbRequiresU = false;
                        maInput.insert(0,"U+");
                    }
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                case '+':
                    // + already found: skip when not U, or edge case of +U+xxxx
                    if( mbRequiresU || (maInput.indexOf("U+") == 0) )
                        mbAllowMoreChars = false;
                    // hex chars followed by '+' - now require a 'U'
                    else if ( !maInput.isEmpty() )
                        mbRequiresU = true;
                    // treat as a normal character
                    else
                    {
                        mbAllowMoreChars = false;
                        if( !bPreventNonHex )
                            maInput.insertUtf32(0, uChar);
                    }
                    break;
                default:
                    // + already found. Since not U, cancel further input
                    if( mbRequiresU )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: only one notation
                    else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
                        mbAllowMoreChars = false;
                    // maximum digits per notation is 8: previous notation found
                    else if( maInput.indexOf("U+") == 8 )
                        mbAllowMoreChars = false;
                    // a hex character. Add to string.
                    else if( rtl::isAsciiHexDigit(uChar) )
                    {
                        mbIsHexString = true;
                        maInput.insertUtf32(0, uChar);
                    }
                    // not a hex character: stop input. keep if it is the first input provided
                    else
                    {
                        mbAllowMoreChars = false;
                        if( maInput.isEmpty() )
                            maInput.insertUtf32(0, uChar);
                    }
            }
    }
    // Paths that did not return early report the (possibly updated) flag.
    return mbAllowMoreChars;
}
938 
939 OUString ToggleUnicodeCodepoint::StringToReplace()
940 {
941     if( maInput.isEmpty() )
942     {
943         //edge case - input finished with incomplete low surrogate or combining characters without a base
944         if( mbAllowMoreChars )
945         {
946             if( !maUtf16.isEmpty() )
947                 maInput = maUtf16;
948             if( !maCombining.isEmpty() )
949                 maInput.append(maCombining);
950         }
951         return maInput.toString();
952     }
953 
954     if( !mbIsHexString )
955         return maInput.toString();
956 
957     //this function potentially modifies the input string.  Prevent addition of further characters
958     mbAllowMoreChars = false;
959 
960     //validate unicode notation.
961     OUString sIn;
962     sal_uInt32 nUnicode = 0;
963     sal_Int32 nUPlus = maInput.indexOf("U+");
964     //if U+ notation used, strip off all extra chars added not in U+ notation
965     if( nUPlus != -1 )
966     {
967         maInput.remove(0, nUPlus);
968         sIn = maInput.copy(2).toString();
969         nUPlus = sIn.indexOf("U+");
970     }
971     else
972         sIn = maInput.toString();
973     while( nUPlus != -1 )
974     {
975         nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
976         //prevent creating control characters or invalid Unicode values
977         if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20  )
978             maInput = sIn.copy(nUPlus);
979         sIn = sIn.copy(nUPlus+2);
980         nUPlus =  sIn.indexOf("U+");
981     }
982 
983     nUnicode = sIn.toUInt32(16);
984     if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
985        maInput.truncate().append( sIn[sIn.getLength()-1] );
986     return maInput.toString();
987 }
988 
989 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
990 {
991     OUString sIn = StringToReplace();
992     sal_Int32 nPos = 0;
993     sal_uInt32 counter = 0;
994     while( nPos < sIn.getLength() )
995     {
996         sIn.iterateCodePoints(&nPos);
997         ++counter;
998     }
999     return counter;
1000 }
1001 
1002 OUString ToggleUnicodeCodepoint::ReplacementString()
1003 {
1004     OUString sIn = StringToReplace();
1005     OUStringBuffer output = "";
1006     sal_Int32 nUPlus = sIn.indexOf("U+");
1007     // convert from hex notation to glyph
1008     if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1009     {
1010         sal_uInt32 nUnicode = 0;
1011         if( nUPlus == 0)
1012         {
1013             sIn = sIn.copy(2);
1014             nUPlus = sIn.indexOf("U+");
1015         }
1016         while( nUPlus > 0 )
1017         {
1018             nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1019             output.appendUtf32( nUnicode );
1020 
1021             sIn = sIn.copy(nUPlus+2);
1022             nUPlus = sIn.indexOf("U+");
1023         }
1024         nUnicode = sIn.toUInt32(16);
1025         output.appendUtf32( nUnicode );
1026     }
1027     // convert from glyph to hex notation
1028     else
1029     {
1030         sal_Int32 nPos = 0;
1031         while( nPos < sIn.getLength() )
1032         {
1033             OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1034             //pad with zeros - minimum length of 4.
1035             for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1036                 aTmp.insert( 0,"0" );
1037             output.append( "U+" );
1038             output.append( aTmp );
1039         }
1040     }
1041     return output.toString();
1042 }
1043 
1044 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
1045