1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This file is part of the LibreOffice project. 4 * 5 * This Source Code Form is subject to the terms of the Mozilla Public 6 * License, v. 2.0. If a copy of the MPL was not distributed with this 7 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 * 9 * This file incorporates work covered by the following license notice: 10 * 11 * Licensed to the Apache Software Foundation (ASF) under one or more 12 * contributor license agreements. See the NOTICE file distributed 13 * with this work for additional information regarding copyright 14 * ownership. The ASF licenses this file to you under the Apache 15 * License, Version 2.0 (the "License"); you may not use this file 16 * except in compliance with the License. You may obtain a copy of 17 * the License at http://www.apache.org/licenses/LICENSE-2.0 . 18 */ 19 20 #include <com/sun/star/i18n/UnicodeType.hpp> 21 #include <com/sun/star/i18n/ScriptType.hpp> 22 #include <i18nlangtag/languagetag.hxx> 23 #include <i18nlangtag/languagetagicu.hxx> 24 #include <i18nutil/unicode.hxx> 25 #include <sal/log.hxx> 26 #include <unicode/numfmt.h> 27 #include "unicode_data.h" 28 #include <rtl/character.hxx> 29 #include <memory> 30 31 // Workaround for glibc braindamage: 32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL" 33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL 34 #undef CURRENCY_SYMBOL 35 36 using namespace ::com::sun::star::i18n; 37 38 template<class L, typename T> 39 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) { 40 41 sal_Int16 i = 0; 42 css::i18n::UnicodeScript type = typeList[0].to; 43 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) { 44 type = typeList[++i].to; 45 } 46 47 return (type < UnicodeScript_kScriptCount && 48 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ? 49 typeList[i].value : unknownType; 50 } 51 52 sal_Int16 53 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) { 54 return getScriptType(ch, typeList, unknownType); 55 } 56 57 sal_Unicode 58 unicode::getUnicodeScriptStart( UnicodeScript type) { 59 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom]; 60 } 61 62 sal_Unicode 63 unicode::getUnicodeScriptEnd( UnicodeScript type) { 64 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]; 65 } 66 67 sal_Int16 68 unicode::getUnicodeType( const sal_Unicode ch ) { 69 static sal_Unicode c = 0x00; 70 static sal_Int16 r = 0x00; 71 72 if (ch == c) return r; 73 else c = ch; 74 75 sal_Int16 address = UnicodeTypeIndex[ch >> 8]; 76 return r = static_cast<sal_Int16>((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] : 77 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]); 78 } 79 80 sal_uInt8 81 unicode::getUnicodeDirection( const sal_Unicode ch ) { 82 static sal_Unicode c = 0x00; 83 static sal_uInt8 r = 0x00; 84 85 if (ch == c) return r; 86 else c = ch; 87 88 sal_Int16 address = UnicodeDirectionIndex[ch >> 8]; 89 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] : 90 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]); 91 92 } 93 94 #define bit(name) (1U << name) 95 96 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER) 97 98 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER) 99 100 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER) 101 102 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\ 103 bit(UnicodeType::MODIFIER_LETTER)|\ 104 bit(UnicodeType::OTHER_LETTER) 105 106 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\ 107 bit(UnicodeType::LINE_SEPARATOR)|\ 108 bit(UnicodeType::PARAGRAPH_SEPARATOR) 109 110 #define CONTROLMASK bit(UnicodeType::CONTROL)|\ 111 bit(UnicodeType::FORMAT)|\ 112 bit(UnicodeType::LINE_SEPARATOR)|\ 113 bit(UnicodeType::PARAGRAPH_SEPARATOR) 114 115 #define IsType(func, mask) \ 116 bool func( const sal_Unicode ch) {\ 117 return (bit(getUnicodeType(ch)) & (mask)) != 0;\ 118 } 119 120 IsType(unicode::isControl, CONTROLMASK) 121 IsType(unicode::isAlpha, ALPHAMASK) 122 IsType(unicode::isSpace, SPACEMASK) 123 124 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\ 125 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f) 126 127 bool unicode::isWhiteSpace( const sal_Unicode ch) { 128 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE))); 129 } 130 131 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript) 132 { 133 //See unicode/uscript.h 134 static const sal_Int16 scriptTypes[] = 135 { 136 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, 137 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, 138 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, 139 // 15 140 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX, 141 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, 142 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, 143 // 30 144 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, 145 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 146 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 147 // 45 148 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, 149 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, 150 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 151 // 60 152 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 153 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, 154 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, 155 // 75 156 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 157 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 158 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 159 // 90 160 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 161 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 162 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, 163 // 105 164 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 165 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 166 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, 167 // 120 168 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 169 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, 170 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 171 // 135 172 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 173 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 174 ScriptType::COMPLEX, 175 ScriptType::WEAK 176 }; 177 178 sal_Int16 nRet; 179 if (eScript < USCRIPT_COMMON) 180 nRet = ScriptType::WEAK; 181 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes)) 182 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild 183 else 184 nRet = scriptTypes[eScript]; 185 return nRet; 186 } 187 188 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript) 189 { 190 OString sRet; 191 switch (eScript) 192 { 193 case USCRIPT_CODE_LIMIT: 194 case USCRIPT_INVALID_CODE: 195 sRet = "zxx"; 196 break; 197 case USCRIPT_COMMON: 198 case USCRIPT_INHERITED: 199 sRet = "und"; 200 break; 201 case USCRIPT_MATHEMATICAL_NOTATION: 202 case USCRIPT_SYMBOLS: 203 sRet = "zxx"; 204 break; 205 case USCRIPT_UNWRITTEN_LANGUAGES: 206 case USCRIPT_UNKNOWN: 207 sRet = "und"; 208 break; 209 case USCRIPT_ARABIC: 210 sRet = "ar"; 211 break; 212 case USCRIPT_ARMENIAN: 213 sRet = "hy"; 214 break; 215 case USCRIPT_BENGALI: 216 sRet = "bn"; 217 break; 218 case USCRIPT_BOPOMOFO: 219 sRet = "zh"; 220 break; 221 case USCRIPT_CHEROKEE: 222 sRet = "chr"; 223 break; 224 case USCRIPT_COPTIC: 225 sRet = "cop"; 226 break; 227 case USCRIPT_CYRILLIC: 228 sRet = "ru"; 229 break; 230 case USCRIPT_DESERET: 231 sRet = "en"; 232 break; 233 case USCRIPT_DEVANAGARI: 234 sRet = "hi"; 235 break; 236 case USCRIPT_ETHIOPIC: 237 sRet = "am"; 238 break; 239 case USCRIPT_GEORGIAN: 240 sRet = "ka"; 241 break; 242 case USCRIPT_GOTHIC: 243 sRet = "got"; 244 break; 245 case USCRIPT_GREEK: 246 sRet = "el"; 247 break; 248 case USCRIPT_GUJARATI: 249 sRet = "gu"; 250 break; 251 case USCRIPT_GURMUKHI: 252 sRet = "pa"; 253 break; 254 case USCRIPT_HAN: 255 sRet = "zh"; 256 break; 257 case USCRIPT_HANGUL: 258 sRet = "ko"; 259 break; 260 case USCRIPT_HEBREW: 261 sRet = "hr"; 262 break; 263 case USCRIPT_HIRAGANA: 264 sRet = "ja"; 265 break; 266 case USCRIPT_KANNADA: 267 sRet = "kn"; 268 break; 269 case USCRIPT_KATAKANA: 270 sRet = "ja"; 271 break; 272 case USCRIPT_KHMER: 273 sRet = "km"; 274 break; 275 case USCRIPT_LAO: 276 sRet = "lo"; 277 break; 278 case USCRIPT_LATIN: 279 sRet = "en"; 280 break; 281 case USCRIPT_MALAYALAM: 282 sRet = "ml"; 283 break; 284 case USCRIPT_MONGOLIAN: 285 sRet = "mn"; 286 break; 287 case USCRIPT_MYANMAR: 288 sRet = "my"; 289 break; 290 case USCRIPT_OGHAM: 291 sRet = "pgl"; 292 break; 293 case USCRIPT_OLD_ITALIC: 294 sRet = "osc"; 295 break; 296 case USCRIPT_ORIYA: 297 sRet = "or"; 298 break; 299 case USCRIPT_RUNIC: 300 sRet = "ang"; 301 break; 302 case USCRIPT_SINHALA: 303 sRet = "si"; 304 break; 305 case USCRIPT_SYRIAC: 306 sRet = "syr"; 307 break; 308 case USCRIPT_TAMIL: 309 sRet = "ta"; 310 break; 311 case USCRIPT_TELUGU: 312 sRet = "te"; 313 break; 314 case USCRIPT_THAANA: 315 sRet = "dv"; 316 break; 317 case USCRIPT_THAI: 318 sRet = "th"; 319 break; 320 case USCRIPT_TIBETAN: 321 sRet = "bo"; 322 break; 323 case USCRIPT_CANADIAN_ABORIGINAL: 324 sRet = "iu"; 325 break; 326 case USCRIPT_YI: 327 sRet = "ii"; 328 break; 329 case USCRIPT_TAGALOG: 330 sRet = "tl"; 331 break; 332 case USCRIPT_HANUNOO: 333 sRet = "hnn"; 334 break; 335 case USCRIPT_BUHID: 336 sRet = "bku"; 337 break; 338 case USCRIPT_TAGBANWA: 339 sRet = "tbw"; 340 break; 341 case USCRIPT_BRAILLE: 342 sRet = "en"; 343 break; 344 case USCRIPT_CYPRIOT: 345 sRet = "ecy"; 346 break; 347 case USCRIPT_LIMBU: 348 sRet = "lif"; 349 break; 350 case USCRIPT_LINEAR_B: 351 sRet = "gmy"; 352 break; 353 case USCRIPT_OSMANYA: 354 sRet = "so"; 355 break; 356 case USCRIPT_SHAVIAN: 357 sRet = "en"; 358 break; 359 case USCRIPT_TAI_LE: 360 sRet = "tdd"; 361 break; 362 case USCRIPT_UGARITIC: 363 sRet = "uga"; 364 break; 365 case USCRIPT_KATAKANA_OR_HIRAGANA: 366 sRet = "ja"; 367 break; 368 case USCRIPT_BUGINESE: 369 sRet = "bug"; 370 break; 371 case USCRIPT_GLAGOLITIC: 372 sRet = "ch"; 373 break; 374 case USCRIPT_KHAROSHTHI: 375 sRet = "pra"; 376 break; 377 case USCRIPT_SYLOTI_NAGRI: 378 sRet = "syl"; 379 break; 380 case USCRIPT_NEW_TAI_LUE: 381 sRet = "khb"; 382 break; 383 case USCRIPT_TIFINAGH: 384 sRet = "tmh"; 385 break; 386 case USCRIPT_OLD_PERSIAN: 387 sRet = "peo"; 388 break; 389 case USCRIPT_BALINESE: 390 sRet = "ban"; 391 break; 392 case USCRIPT_BATAK: 393 sRet = "btk"; 394 break; 395 case USCRIPT_BLISSYMBOLS: 396 sRet = "en"; 397 break; 398 case USCRIPT_BRAHMI: 399 sRet = "pra"; 400 break; 401 case USCRIPT_CHAM: 402 sRet = "cja"; 403 break; 404 case USCRIPT_CIRTH: 405 sRet = "sjn"; 406 break; 407 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC: 408 sRet = "cu"; 409 break; 410 case USCRIPT_DEMOTIC_EGYPTIAN: 411 case USCRIPT_HIERATIC_EGYPTIAN: 412 case USCRIPT_EGYPTIAN_HIEROGLYPHS: 413 sRet = "egy"; 414 break; 415 case USCRIPT_KHUTSURI: 416 sRet = "ka"; 417 break; 418 case USCRIPT_SIMPLIFIED_HAN: 419 sRet = "zh"; 420 break; 421 case USCRIPT_TRADITIONAL_HAN: 422 sRet = "zh"; 423 break; 424 case USCRIPT_PAHAWH_HMONG: 425 sRet = "blu"; 426 break; 427 case USCRIPT_OLD_HUNGARIAN: 428 sRet = "ohu"; 429 break; 430 case USCRIPT_HARAPPAN_INDUS: 431 sRet = "xiv"; 432 break; 433 case USCRIPT_JAVANESE: 434 sRet = "kaw"; 435 break; 436 case USCRIPT_KAYAH_LI: 437 sRet = "eky"; 438 break; 439 case USCRIPT_LATIN_FRAKTUR: 440 sRet = "de"; 441 break; 442 case USCRIPT_LATIN_GAELIC: 443 sRet = "ga"; 444 break; 445 case USCRIPT_LEPCHA: 446 sRet = "lep"; 447 break; 448 case USCRIPT_LINEAR_A: 449 sRet = "ecr"; 450 break; 451 case USCRIPT_MAYAN_HIEROGLYPHS: 452 sRet = "myn"; 453 break; 454 case USCRIPT_MEROITIC: 455 sRet = "xmr"; 456 break; 457 case USCRIPT_NKO: 458 sRet = "nqo"; 459 break; 460 case USCRIPT_ORKHON: 461 sRet = "otk"; 462 break; 463 case USCRIPT_OLD_PERMIC: 464 sRet = "kv"; 465 break; 466 case USCRIPT_PHAGS_PA: 467 sRet = "xng"; 468 break; 469 case USCRIPT_PHOENICIAN: 470 sRet = "phn"; 471 break; 472 case USCRIPT_PHONETIC_POLLARD: 473 sRet = "hmd"; 474 break; 475 case USCRIPT_RONGORONGO: 476 sRet = "rap"; 477 break; 478 case USCRIPT_SARATI: 479 sRet = "qya"; 480 break; 481 case USCRIPT_ESTRANGELO_SYRIAC: 482 sRet = "syr"; 483 break; 484 case USCRIPT_WESTERN_SYRIAC: 485 sRet = "tru"; 486 break; 487 case USCRIPT_EASTERN_SYRIAC: 488 sRet = "aii"; 489 break; 490 case USCRIPT_TENGWAR: 491 sRet = "sjn"; 492 break; 493 case USCRIPT_VAI: 494 sRet = "vai"; 495 break; 496 case USCRIPT_VISIBLE_SPEECH: 497 sRet = "en"; 498 break; 499 case USCRIPT_CUNEIFORM: 500 sRet = "akk"; 501 break; 502 case USCRIPT_CARIAN: 503 sRet = "xcr"; 504 break; 505 case USCRIPT_JAPANESE: 506 sRet = "ja"; 507 break; 508 case USCRIPT_LANNA: 509 sRet = "nod"; 510 break; 511 case USCRIPT_LYCIAN: 512 sRet = "xlc"; 513 break; 514 case USCRIPT_LYDIAN: 515 sRet = "xld"; 516 break; 517 case USCRIPT_OL_CHIKI: 518 sRet = "sat"; 519 break; 520 case USCRIPT_REJANG: 521 sRet = "rej"; 522 break; 523 case USCRIPT_SAURASHTRA: 524 sRet = "saz"; 525 break; 526 case USCRIPT_SIGN_WRITING: 527 sRet = "en"; 528 break; 529 case USCRIPT_SUNDANESE: 530 sRet = "su"; 531 break; 532 case USCRIPT_MOON: 533 sRet = "en"; 534 break; 535 case USCRIPT_MEITEI_MAYEK: 536 sRet = "mni"; 537 break; 538 case USCRIPT_IMPERIAL_ARAMAIC: 539 sRet = "arc"; 540 break; 541 case USCRIPT_AVESTAN: 542 sRet = "ae"; 543 break; 544 case USCRIPT_CHAKMA: 545 sRet = "ccp"; 546 break; 547 case USCRIPT_KOREAN: 548 sRet = "ko"; 549 break; 550 case USCRIPT_KAITHI: 551 sRet = "awa"; 552 break; 553 case USCRIPT_MANICHAEAN: 554 sRet = "xmn"; 555 break; 556 case USCRIPT_INSCRIPTIONAL_PAHLAVI: 557 case USCRIPT_PSALTER_PAHLAVI: 558 case USCRIPT_BOOK_PAHLAVI: 559 case USCRIPT_INSCRIPTIONAL_PARTHIAN: 560 sRet = "xpr"; 561 break; 562 case USCRIPT_SAMARITAN: 563 sRet = "heb"; 564 break; 565 case USCRIPT_TAI_VIET: 566 sRet = "blt"; 567 break; 568 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */ 569 sRet = "mic"; 570 break; 571 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4) 572 case USCRIPT_NABATAEAN: //no language with an assigned code yet 573 sRet = "mis"; 574 break; 575 case USCRIPT_PALMYRENE: //no language with an assigned code yet 576 sRet = "mis"; 577 break; 578 case USCRIPT_BAMUM: 579 sRet = "bax"; 580 break; 581 case USCRIPT_LISU: 582 sRet = "lis"; 583 break; 584 case USCRIPT_NAKHI_GEBA: 585 sRet = "nxq"; 586 break; 587 case USCRIPT_OLD_SOUTH_ARABIAN: 588 sRet = "xsa"; 589 break; 590 case USCRIPT_BASSA_VAH: 591 sRet = "bsq"; 592 break; 593 case USCRIPT_DUPLOYAN_SHORTAND: 594 sRet = "fr"; 595 break; 596 case USCRIPT_ELBASAN: 597 sRet = "sq"; 598 break; 599 case USCRIPT_GRANTHA: 600 sRet = "ta"; 601 break; 602 case USCRIPT_KPELLE: 603 sRet = "kpe"; 604 break; 605 case USCRIPT_LOMA: 606 sRet = "lom"; 607 break; 608 case USCRIPT_MENDE: 609 sRet = "men"; 610 break; 611 case USCRIPT_MEROITIC_CURSIVE: 612 sRet = "xmr"; 613 break; 614 case USCRIPT_OLD_NORTH_ARABIAN: 615 sRet = "xna"; 616 break; 617 case USCRIPT_SINDHI: 618 sRet = "sd"; 619 break; 620 case USCRIPT_WARANG_CITI: 621 sRet = "hoc"; 622 break; 623 #endif 624 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8) 625 case USCRIPT_AFAKA: 626 sRet = "djk"; 627 break; 628 case USCRIPT_JURCHEN: 629 sRet = "juc"; 630 break; 631 case USCRIPT_MRO: 632 sRet = "cmr"; 633 break; 634 case USCRIPT_NUSHU: //no language with an assigned code yet 635 sRet = "mis"; 636 break; 637 case USCRIPT_SHARADA: 638 sRet = "sa"; 639 break; 640 case USCRIPT_SORA_SOMPENG: 641 sRet = "srb"; 642 break; 643 case USCRIPT_TAKRI: 644 sRet = "doi"; 645 break; 646 case USCRIPT_TANGUT: 647 sRet = "txg"; 648 break; 649 case USCRIPT_WOLEAI: 650 sRet = "woe"; 651 break; 652 #endif 653 #if (U_ICU_VERSION_MAJOR_NUM >= 49) 654 case USCRIPT_ANATOLIAN_HIEROGLYPHS: 655 sRet = "hlu"; 656 break; 657 case USCRIPT_KHOJKI: 658 sRet = "gu"; 659 break; 660 case USCRIPT_TIRHUTA: 661 sRet = "mai"; 662 break; 663 #endif 664 #if (U_ICU_VERSION_MAJOR_NUM >= 52) 665 case USCRIPT_CAUCASIAN_ALBANIAN: 666 sRet = "xag"; 667 break; 668 case USCRIPT_MAHAJANI: 669 sRet = "mwr"; 670 break; 671 #endif 672 #if (U_ICU_VERSION_MAJOR_NUM >= 54) 673 case USCRIPT_AHOM: 674 sRet = "aho"; 675 break; 676 case USCRIPT_HATRAN: 677 sRet = "qly-Hatr"; 678 break; 679 case USCRIPT_MODI: 680 sRet = "mr-Modi"; 681 break; 682 case USCRIPT_MULTANI: 683 sRet = "skr-Mutl"; 684 break; 685 case USCRIPT_PAU_CIN_HAU: 686 sRet = "ctd-Pauc"; 687 break; 688 case USCRIPT_SIDDHAM: 689 sRet = "sa-Sidd"; 690 break; 691 #endif 692 #if (U_ICU_VERSION_MAJOR_NUM >= 58) 693 case USCRIPT_ADLAM: 694 sRet = "mis"; // Adlm - Adlam for Fulani, no language code 695 break; 696 case USCRIPT_BHAIKSUKI: 697 sRet = "mis"; // Bhks - Bhaiksuki for some Buddhist texts, no language code 698 break; 699 case USCRIPT_MARCHEN: 700 sRet = "bo-Marc"; 701 break; 702 case USCRIPT_NEWA: 703 sRet = "new-Newa"; 704 break; 705 case USCRIPT_OSAGE: 706 sRet = "osa-Osge"; 707 break; 708 case USCRIPT_HAN_WITH_BOPOMOFO: 709 sRet = "mis"; // Hanb - Han with Bopomofo, zh-Hanb ? 710 break; 711 case USCRIPT_JAMO: 712 sRet = "mis"; // Jamo - Jamo subset of Hangul, ko-Jamo ? 713 break; 714 case USCRIPT_SYMBOLS_EMOJI: 715 sRet = "mis"; // Zsye - Emoji variant 716 break; 717 #endif 718 #if (U_ICU_VERSION_MAJOR_NUM >= 60) 719 case USCRIPT_MASARAM_GONDI: 720 sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno 721 break; 722 case USCRIPT_SOYOMBO: 723 sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit 724 break; 725 case USCRIPT_ZANABAZAR_SQUARE: 726 sRet = "mn-Zanb"; // abugida to write Mongolian 727 break; 728 #endif 729 #if (U_ICU_VERSION_MAJOR_NUM >= 62) 730 case USCRIPT_DOGRA: 731 sRet = "dgo"; // Dogri proper 732 break; 733 case USCRIPT_GUNJALA_GONDI: 734 sRet = "wsg"; // Adilabad Gondi 735 break; 736 case USCRIPT_MAKASAR: 737 sRet = "mak"; 738 break; 739 case USCRIPT_MEDEFAIDRIN: 740 sRet = "mis-Medf"; // Uncoded with script 741 break; 742 case USCRIPT_HANIFI_ROHINGYA: 743 sRet = "rhg"; 744 break; 745 case USCRIPT_SOGDIAN: 746 sRet = "sog"; 747 break; 748 case USCRIPT_OLD_SOGDIAN: 749 sRet = "sog"; 750 break; 751 #endif 752 } 753 return sRet; 754 } 755 756 //Format a number as a percentage according to the rules of the given 757 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE 758 OUString unicode::formatPercent(double dNumber, 759 const LanguageTag &rLangTag) 760 { 761 // get a currency formatter for this locale ID 762 UErrorCode errorCode=U_ZERO_ERROR; 763 764 LanguageTag aLangTag(rLangTag); 765 766 // As of CLDR Version 24 these languages were not listed as using spacing 767 // between number and % but are reported as such by our l10n groups 768 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html 769 // so format using French which has the desired rules 770 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl") 771 aLangTag.reset("fr-FR"); 772 773 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag); 774 775 std::unique_ptr<icu::NumberFormat> xF( 776 icu::NumberFormat::createPercentInstance(aLocale, errorCode)); 777 if(U_FAILURE(errorCode)) 778 { 779 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed"); 780 return OUString::number(dNumber) + "%"; 781 } 782 783 icu::UnicodeString output; 784 xF->format(dNumber/100, output); 785 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()), 786 output.length()); 787 if (rLangTag.getLanguage() == "de") 788 { 789 //narrow no-break space instead of (normal) no-break space 790 return aRet.replace(0x00A0, 0x202F); 791 } 792 return aRet; 793 } 794 795 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar) 796 { 797 //arbitrarily chosen maximum length allowed - normal max usage would be around 30. 798 if( maInput.getLength() > 255 ) 799 mbAllowMoreChars = false; 800 801 if( !mbAllowMoreChars ) 802 return false; 803 804 bool bPreventNonHex = false; 805 if( maInput.indexOf("U+") != -1 ) 806 bPreventNonHex = true; 807 808 switch ( unicode::getUnicodeType(uChar) ) 809 { 810 case css::i18n::UnicodeType::SURROGATE: 811 if( bPreventNonHex ) 812 { 813 mbAllowMoreChars = false; 814 return false; 815 } 816 817 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() ) 818 { 819 maUtf16.append(uChar); 820 return true; 821 } 822 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() ) 823 maUtf16.insert(0, uChar ); 824 //end of hex strings, or unexpected order of high/low, so don't accept more 825 if( !maUtf16.isEmpty() ) 826 maInput.append(maUtf16); 827 if( !maCombining.isEmpty() ) 828 maInput.append(maCombining); 829 mbAllowMoreChars = false; 830 break; 831 832 case css::i18n::UnicodeType::NON_SPACING_MARK: 833 case css::i18n::UnicodeType::COMBINING_SPACING_MARK: 834 if( bPreventNonHex ) 835 { 836 mbAllowMoreChars = false; 837 return false; 838 } 839 840 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark. 841 if( !maUtf16.isEmpty() ) 842 { 843 maInput = maUtf16; 844 if( !maCombining.isEmpty() ) 845 maInput.append(maCombining); 846 mbAllowMoreChars = false; 847 return false; 848 } 849 maCombining.insert(0, uChar); 850 break; 851 852 default: 853 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character. 854 if( !maUtf16.isEmpty() ) 855 { 856 maInput = maUtf16; 857 if( !maCombining.isEmpty() ) 858 maInput.append(maCombining); 859 mbAllowMoreChars = false; 860 return false; 861 } 862 863 if( !maCombining.isEmpty() ) 864 { 865 maCombining.insert(0, uChar); 866 maInput = maCombining; 867 mbAllowMoreChars = false; 868 return false; 869 } 870 871 // 0 - 1f are control characters. Do not process those. 872 if( uChar < 0x20 ) 873 { 874 mbAllowMoreChars = false; 875 return false; 876 } 877 878 switch( uChar ) 879 { 880 case 'u': 881 case 'U': 882 // U+ notation found. Continue looking for another one. 883 if( mbRequiresU ) 884 { 885 mbRequiresU = false; 886 maInput.insert(0,"U+"); 887 } 888 // treat as a normal character 889 else 890 { 891 mbAllowMoreChars = false; 892 if( !bPreventNonHex ) 893 maInput.insertUtf32(0, uChar); 894 } 895 break; 896 case '+': 897 // + already found: skip when not U, or edge case of +U+xxxx 898 if( mbRequiresU || (maInput.indexOf("U+") == 0) ) 899 mbAllowMoreChars = false; 900 // hex chars followed by '+' - now require a 'U' 901 else if ( !maInput.isEmpty() ) 902 mbRequiresU = true; 903 // treat as a normal character 904 else 905 { 906 mbAllowMoreChars = false; 907 if( !bPreventNonHex ) 908 maInput.insertUtf32(0, uChar); 909 } 910 break; 911 default: 912 // + already found. Since not U, cancel further input 913 if( mbRequiresU ) 914 mbAllowMoreChars = false; 915 // maximum digits per notation is 8: only one notation 916 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 ) 917 mbAllowMoreChars = false; 918 // maximum digits per notation is 8: previous notation found 919 else if( maInput.indexOf("U+") == 8 ) 920 mbAllowMoreChars = false; 921 // a hex character. Add to string. 922 else if( rtl::isAsciiHexDigit(uChar) ) 923 { 924 mbIsHexString = true; 925 maInput.insertUtf32(0, uChar); 926 } 927 // not a hex character: stop input. keep if it is the first input provided 928 else 929 { 930 mbAllowMoreChars = false; 931 if( maInput.isEmpty() ) 932 maInput.insertUtf32(0, uChar); 933 } 934 } 935 } 936 return mbAllowMoreChars; 937 } 938 939 OUString ToggleUnicodeCodepoint::StringToReplace() 940 { 941 if( maInput.isEmpty() ) 942 { 943 //edge case - input finished with incomplete low surrogate or combining characters without a base 944 if( mbAllowMoreChars ) 945 { 946 if( !maUtf16.isEmpty() ) 947 maInput = maUtf16; 948 if( !maCombining.isEmpty() ) 949 maInput.append(maCombining); 950 } 951 return maInput.toString(); 952 } 953 954 if( !mbIsHexString ) 955 return maInput.toString(); 956 957 //this function potentially modifies the input string. Prevent addition of further characters 958 mbAllowMoreChars = false; 959 960 //validate unicode notation. 961 OUString sIn; 962 sal_uInt32 nUnicode = 0; 963 sal_Int32 nUPlus = maInput.indexOf("U+"); 964 //if U+ notation used, strip off all extra chars added not in U+ notation 965 if( nUPlus != -1 ) 966 { 967 maInput.remove(0, nUPlus); 968 sIn = maInput.copy(2).toString(); 969 nUPlus = sIn.indexOf("U+"); 970 } 971 else 972 sIn = maInput.toString(); 973 while( nUPlus != -1 ) 974 { 975 nUnicode = sIn.copy(0, nUPlus).toUInt32(16); 976 //prevent creating control characters or invalid Unicode values 977 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 ) 978 maInput = sIn.copy(nUPlus); 979 sIn = sIn.copy(nUPlus+2); 980 nUPlus = sIn.indexOf("U+"); 981 } 982 983 nUnicode = sIn.toUInt32(16); 984 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 ) 985 maInput.truncate().append( sIn[sIn.getLength()-1] ); 986 return maInput.toString(); 987 } 988 989 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete() 990 { 991 OUString sIn = StringToReplace(); 992 sal_Int32 nPos = 0; 993 sal_uInt32 counter = 0; 994 while( nPos < sIn.getLength() ) 995 { 996 sIn.iterateCodePoints(&nPos); 997 ++counter; 998 } 999 return counter; 1000 } 1001 1002 OUString ToggleUnicodeCodepoint::ReplacementString() 1003 { 1004 OUString sIn = StringToReplace(); 1005 OUStringBuffer output = ""; 1006 sal_Int32 nUPlus = sIn.indexOf("U+"); 1007 // convert from hex notation to glyph 1008 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) ) 1009 { 1010 sal_uInt32 nUnicode = 0; 1011 if( nUPlus == 0) 1012 { 1013 sIn = sIn.copy(2); 1014 nUPlus = sIn.indexOf("U+"); 1015 } 1016 while( nUPlus > 0 ) 1017 { 1018 nUnicode = sIn.copy(0, nUPlus).toUInt32(16); 1019 output.appendUtf32( nUnicode ); 1020 1021 sIn = sIn.copy(nUPlus+2); 1022 nUPlus = sIn.indexOf("U+"); 1023 } 1024 nUnicode = sIn.toUInt32(16); 1025 output.appendUtf32( nUnicode ); 1026 } 1027 // convert from glyph to hex notation 1028 else 1029 { 1030 sal_Int32 nPos = 0; 1031 while( nPos < sIn.getLength() ) 1032 { 1033 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16); 1034 //pad with zeros - minimum length of 4. 1035 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i ) 1036 aTmp.insert( 0,"0" ); 1037 output.append( "U+" ); 1038 output.append( aTmp ); 1039 } 1040 } 1041 return output.toString(); 1042 } 1043 1044 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 1045
