1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This file is part of the LibreOffice project. 4 * 5 * This Source Code Form is subject to the terms of the Mozilla Public 6 * License, v. 2.0. If a copy of the MPL was not distributed with this 7 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 * 9 * This file incorporates work covered by the following license notice: 10 * 11 * Licensed to the Apache Software Foundation (ASF) under one or more 12 * contributor license agreements. See the NOTICE file distributed 13 * with this work for additional information regarding copyright 14 * ownership. The ASF licenses this file to you under the Apache 15 * License, Version 2.0 (the "License"); you may not use this file 16 * except in compliance with the License. You may obtain a copy of 17 * the License at http://www.apache.org/licenses/LICENSE-2.0 . 18 */ 19 20 #include <breakiterator_unicode.hxx> 21 #include <cppuhelper/supportsservice.hxx> 22 #include <localedata.hxx> 23 #include <i18nlangtag/languagetag.hxx> 24 #include <i18nlangtag/languagetagicu.hxx> 25 #include <unicode/uchar.h> 26 #include <unicode/locid.h> 27 #include <unicode/rbbi.h> 28 #include <unicode/udata.h> 29 #include <rtl/strbuf.hxx> 30 #include <rtl/ustring.hxx> 31 #include <string.h> 32 33 U_CDECL_BEGIN 34 extern const char OpenOffice_dat[]; 35 U_CDECL_END 36 37 using namespace ::com::sun::star; 38 using namespace ::com::sun::star::i18n; 39 using namespace ::com::sun::star::lang; 40 41 namespace i18npool { 42 43 // Cache map of breakiterators, stores state information so has to be 44 // thread_local. 45 thread_local static BreakIterator_Unicode::BIMap theBIMap; 46 47 BreakIterator_Unicode::BreakIterator_Unicode() 48 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name 49 , lineRule( "line" ) 50 , icuBI( nullptr ) 51 { 52 } 53 54 BreakIterator_Unicode::~BreakIterator_Unicode() 55 { 56 } 57 58 /* 59 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's 60 setbreakType method. 61 */ 62 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator 63 { 64 public: 65 #if (U_ICU_VERSION_MAJOR_NUM < 58) 66 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58. 67 void publicSetBreakType(int32_t type) 68 { 69 setBreakType(type); 70 }; 71 #endif 72 OOoRuleBasedBreakIterator(UDataMemory* image, 73 UErrorCode &status) 74 : icu::RuleBasedBreakIterator(image, status) 75 { }; 76 77 }; 78 79 // loading ICU breakiterator on demand. 80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale, 81 sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText) 82 { 83 bool bNewBreak = false; 84 UErrorCode status = U_ZERO_ERROR; 85 sal_Int16 breakType = 0; 86 switch (rBreakType) { 87 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break; 88 case LOAD_WORD_BREAKITERATOR: 89 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT); 90 icuBI=&words[nWordType]; 91 switch (nWordType) { 92 case WordType::ANY_WORD: break; // odd but previous behavior 93 case WordType::ANYWORD_IGNOREWHITESPACES: 94 breakType = 0; rule = "edit_word"; break; 95 case WordType::DICTIONARY_WORD: 96 breakType = 1; rule = "dict_word"; break; 97 default: 98 case WordType::WORD_COUNT: 99 breakType = 2; rule = "count_word"; break; 100 } 101 break; 102 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break; 103 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break; 104 } 105 106 // Using the cache map prevents accessing the file system for each 107 // udata_open() where ICU tries first files then data objects. And that for 108 // two fallbacks worst case.. for each new allocated EditEngine, layout 109 // cell, ... *ouch* Also non-rule locale based iterators can be mapped. 110 // This also speeds up loading iterators for alternating or generally more 111 // than one language/locale in that iterators are not constructed and 112 // destroyed en masse. 113 // Four possible keys, locale rule based with break type, locale rule based 114 // only, rule based only, locale based with break type. A fifth global key 115 // for the initial lookup. 116 // Multiple global keys may map to identical value data. 117 // All enums used here should be in the range 0..9 so assert that and avoid 118 // expensive numeric conversion in append() for faster construction of the 119 // always used global key. 120 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9); 121 const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8()); 122 OStringBuffer aKeyBuf(64); 123 aKeyBuf.append( aLangtagStr).append(';'); 124 if (rule) 125 aKeyBuf.append(rule); 126 aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';'). 127 append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType)); 128 // langtag;rule;breakType;rBreakType;nWordType 129 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear()); 130 131 if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) 132 { 133 134 auto aMapIt( theBIMap.find( aBIMapGlobalKey)); 135 bool bInMap = (aMapIt != theBIMap.end()); 136 if (bInMap) 137 icuBI->mpValue = aMapIt->second; 138 else 139 icuBI->mpValue.reset(); 140 141 if (!bInMap && rule) do { 142 uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale); 143 144 status = U_ZERO_ERROR; 145 udata_setAppData("OpenOffice", OpenOffice_dat, &status); 146 if ( !U_SUCCESS(status) ) throw uno::RuntimeException(); 147 148 std::unique_ptr<OOoRuleBasedBreakIterator> rbi; 149 150 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty()) 151 { 152 // langtag;rule;breakType 153 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType)); 154 aMapIt = theBIMap.find( aBIMapRuleTypeKey); 155 bInMap = (aMapIt != theBIMap.end()); 156 if (bInMap) 157 { 158 icuBI->mpValue = aMapIt->second; 159 icuBI->maBIMapKey = aBIMapGlobalKey; 160 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); 161 break; // do 162 } 163 164 rbi.reset(new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk", 165 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status)); 166 167 if (U_SUCCESS(status)) 168 { 169 icuBI->mpValue.reset( new BI_ValueData); 170 icuBI->mpValue->mpBreakIterator = std::move( rbi); 171 theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue)); 172 } 173 else 174 { 175 rbi.reset(); 176 } 177 } 178 //use icu's breakiterator for Thai, Tibetan and Dzongkha 179 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km") 180 { 181 // language;rule (not langtag, unless we'd actually load such) 182 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8()); 183 const OString aBIMapRuleKey( aLanguage + ";" + rule); 184 aMapIt = theBIMap.find( aBIMapRuleKey); 185 bInMap = (aMapIt != theBIMap.end()); 186 if (bInMap) 187 { 188 icuBI->mpValue = aMapIt->second; 189 icuBI->maBIMapKey = aBIMapGlobalKey; 190 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); 191 break; // do 192 } 193 194 status = U_ZERO_ERROR; 195 OStringBuffer aUDName(64); 196 aUDName.append(rule); 197 aUDName.append('_'); 198 aUDName.append( aLanguage); 199 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status); 200 if( U_SUCCESS(status) ) 201 rbi.reset(new OOoRuleBasedBreakIterator( pUData, status)); 202 if ( U_SUCCESS(status) ) 203 { 204 icuBI->mpValue.reset( new BI_ValueData); 205 icuBI->mpValue->mpBreakIterator = std::move( rbi); 206 theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue)); 207 } 208 else 209 { 210 rbi.reset(); 211 212 // ;rule (only) 213 const OString aBIMapRuleOnlyKey( OString(";") + rule); 214 aMapIt = theBIMap.find( aBIMapRuleOnlyKey); 215 bInMap = (aMapIt != theBIMap.end()); 216 if (bInMap) 217 { 218 icuBI->mpValue = aMapIt->second; 219 icuBI->maBIMapKey = aBIMapGlobalKey; 220 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); 221 break; // do 222 } 223 224 status = U_ZERO_ERROR; 225 pUData = udata_open("OpenOffice", "brk", rule, &status); 226 if( U_SUCCESS(status) ) 227 rbi.reset(new OOoRuleBasedBreakIterator( pUData, status)); 228 if ( U_SUCCESS(status) ) 229 { 230 icuBI->mpValue.reset( new BI_ValueData); 231 icuBI->mpValue->mpBreakIterator = std::move( rbi); 232 theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue)); 233 } 234 else 235 { 236 rbi.reset(); 237 } 238 } 239 } 240 if (rbi) { 241 #if (U_ICU_VERSION_MAJOR_NUM < 58) 242 // ICU 58 made RuleBasedBreakIterator::setBreakType() private 243 // instead of protected, so the old workaround of 244 // https://ssl.icu-project.org/trac/ticket/5498 245 // doesn't work anymore. However, they also claim to have fixed 246 // the cause that an initial fBreakType==-1 would lead to an 247 // endless loop under some circumstances. 248 // Let's see ... 249 switch (rBreakType) { 250 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break; 251 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break; 252 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break; 253 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break; 254 } 255 #endif 256 } 257 } while (false); 258 259 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do { 260 // langtag;;;rBreakType (empty rule; empty breakType) 261 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType)); 262 aMapIt = theBIMap.find( aBIMapLocaleTypeKey); 263 bInMap = (aMapIt != theBIMap.end()); 264 if (bInMap) 265 { 266 icuBI->mpValue = aMapIt->second; 267 icuBI->maBIMapKey = aBIMapGlobalKey; 268 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); 269 break; // do 270 } 271 272 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale))); 273 std::shared_ptr< icu::BreakIterator > pBI; 274 275 status = U_ZERO_ERROR; 276 switch (rBreakType) { 277 case LOAD_CHARACTER_BREAKITERATOR: 278 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) ); 279 break; 280 case LOAD_WORD_BREAKITERATOR: 281 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) ); 282 break; 283 case LOAD_SENTENCE_BREAKITERATOR: 284 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) ); 285 break; 286 case LOAD_LINE_BREAKITERATOR: 287 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) ); 288 break; 289 } 290 if ( !U_SUCCESS(status) || !pBI ) { 291 throw uno::RuntimeException(); 292 } 293 icuBI->mpValue.reset( new BI_ValueData); 294 icuBI->mpValue->mpBreakIterator = pBI; 295 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue)); 296 } while (false); 297 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) { 298 throw uno::RuntimeException(); 299 } 300 icuBI->maBIMapKey = aBIMapGlobalKey; 301 if (!bInMap) 302 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); 303 bNewBreak=true; 304 } 305 306 if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData) 307 { 308 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr()); 309 310 status = U_ZERO_ERROR; 311 icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status); 312 313 if (!U_SUCCESS(status)) 314 throw uno::RuntimeException(); 315 316 icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status); 317 318 if (!U_SUCCESS(status)) 319 throw uno::RuntimeException(); 320 321 icuBI->mpValue->maICUText = rText; 322 } 323 } 324 325 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text, 326 sal_Int32 nStartPos, const lang::Locale &rLocale, 327 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 328 { 329 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode 330 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); 331 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get(); 332 for (nDone = 0; nDone < nCount; nDone++) { 333 nStartPos = pBI->following(nStartPos); 334 if (nStartPos == icu::BreakIterator::DONE) 335 return Text.getLength(); 336 } 337 } else { // for CHARACTER mode 338 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++) 339 Text.iterateCodePoints(&nStartPos); 340 } 341 return nStartPos; 342 } 343 344 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text, 345 sal_Int32 nStartPos, const lang::Locale& rLocale, 346 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 347 { 348 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode 349 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); 350 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get(); 351 for (nDone = 0; nDone < nCount; nDone++) { 352 nStartPos = pBI->preceding(nStartPos); 353 if (nStartPos == icu::BreakIterator::DONE) 354 return 0; 355 } 356 } else { // for BS to delete one char and CHARACTER mode. 357 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++) 358 Text.iterateCodePoints(&nStartPos, -1); 359 } 360 return nStartPos; 361 } 362 363 364 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos, 365 const lang::Locale& rLocale, sal_Int16 rWordType ) 366 { 367 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); 368 369 Boundary rv; 370 rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos); 371 if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE ) 372 rv.endPos = result.startPos; 373 else { 374 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || 375 rWordType == WordType::DICTIONARY_WORD ) && 376 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) ) 377 rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); 378 379 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); 380 if(rv.endPos == icu::BreakIterator::DONE) 381 rv.endPos = rv.startPos; 382 } 383 return rv; 384 } 385 386 387 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos, 388 const lang::Locale& rLocale, sal_Int16 rWordType) 389 { 390 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); 391 392 Boundary rv; 393 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos); 394 if( rv.startPos < 0) 395 rv.endPos = rv.startPos; 396 else { 397 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || 398 rWordType == WordType::DICTIONARY_WORD) && 399 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) ) 400 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos); 401 402 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); 403 if(rv.endPos == icu::BreakIterator::DONE) 404 rv.endPos = rv.startPos; 405 } 406 return rv; 407 } 408 409 410 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale, 411 sal_Int16 rWordType, sal_Bool bDirection ) 412 { 413 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); 414 sal_Int32 len = Text.getLength(); 415 416 Boundary rv; 417 if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) { 418 rv.startPos = rv.endPos = nPos; 419 if((bDirection || nPos == 0) && nPos < len) //forward 420 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos); 421 else 422 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos); 423 } else { 424 if(nPos <= 0) { 425 rv.startPos = 0; 426 rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0; 427 } else if(nPos >= len) { 428 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len); 429 rv.endPos = len; 430 } else { 431 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos); 432 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos); 433 } 434 } 435 if (rv.startPos == icu::BreakIterator::DONE) 436 rv.startPos = rv.endPos; 437 else if (rv.endPos == icu::BreakIterator::DONE) 438 rv.endPos = rv.startPos; 439 440 return rv; 441 } 442 443 444 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos, 445 const lang::Locale &rLocale ) 446 { 447 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); 448 449 sal_Int32 len = Text.getLength(); 450 if (len > 0 && nStartPos == len) 451 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence 452 if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos)) 453 nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos); 454 455 // skip preceding space. 456 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos); 457 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos); 458 Text.iterateCodePoints(&nStartPos, -1); 459 460 return nStartPos; 461 } 462 463 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos, 464 const lang::Locale &rLocale ) 465 { 466 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); 467 468 sal_Int32 len = Text.getLength(); 469 if (len > 0 && nStartPos == len) 470 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence 471 nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos); 472 473 sal_Int32 nPos=nStartPos; 474 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos; 475 476 return nStartPos; 477 } 478 479 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak( 480 const OUString& Text, sal_Int32 nStartPos, 481 const lang::Locale& rLocale, sal_Int32 nMinBreakPos, 482 const LineBreakHyphenationOptions& hOptions, 483 const LineBreakUserOptions& /*rOptions*/ ) 484 { 485 LineBreakResults lbr; 486 487 if (nStartPos >= Text.getLength()) { 488 lbr.breakIndex = Text.getLength(); 489 lbr.breakType = BreakType::WORDBOUNDARY; 490 return lbr; 491 } 492 493 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text); 494 495 icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get(); 496 bool GlueSpace=true; 497 while (GlueSpace) { 498 if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break 499 lbr.breakIndex = nStartPos; 500 lbr.breakType = BreakType::WORDBOUNDARY; 501 } else if (hOptions.rHyphenator.is()) { //Hyphenation break 502 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0; 503 pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word" 504 505 sal_Int32 nStartPosWordEnd = nStartPos; 506 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation 507 nStartPosWordEnd --; 508 509 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale, 510 WordType::DICTIONARY_WORD, false); 511 512 nStartPosWordEnd = wBoundary.endPos; 513 while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation 514 nStartPosWordEnd ++; 515 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos; 516 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos; 517 #define SPACE 0x0020 518 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE); 519 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos, 520 wBoundary.endPos - wBoundary.startPos), rLocale, 521 static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions); 522 if (aHyphenatedWord.is()) { 523 lbr.rHyphenatedWord = aHyphenatedWord; 524 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos ) 525 lbr.breakIndex = -1; 526 else 527 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos(); 528 lbr.breakType = BreakType::HYPHENATION; 529 530 // check not optimal hyphenation of "word-word" (word with hyphens) 531 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) { 532 lbr.breakIndex = pLineBI->current(); 533 lbr.breakType = BreakType::WORDBOUNDARY; 534 } 535 536 } else { 537 lbr.breakIndex = pLineBI->preceding(nStartPos); 538 lbr.breakType = BreakType::WORDBOUNDARY; 539 } 540 } else { //word boundary break 541 lbr.breakIndex = pLineBI->preceding(nStartPos); 542 lbr.breakType = BreakType::WORDBOUNDARY; 543 544 // Special case for Slash U+002F SOLIDUS in URI and path names. 545 // TR14 defines that as SY: Symbols Allowing Break After (A). 546 // This is unwanted in paths, see also i#17155 547 if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/') 548 { 549 // Look backward and take any whitespace before as a break 550 // opportunity. This also glues something like "w/o". 551 // Avoid an overly long path and break it as was indicated. 552 // Overly long here is arbitrarily defined. 553 const sal_Int32 nOverlyLong = 66; 554 sal_Int32 nPos = lbr.breakIndex - 1; 555 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong) 556 { 557 if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1))) 558 { 559 lbr.breakIndex = nPos + 1; 560 break; 561 } 562 } 563 } 564 } 565 566 #define WJ 0x2060 // Word Joiner 567 GlueSpace=false; 568 if (lbr.breakType == BreakType::WORDBOUNDARY) { 569 nStartPos = lbr.breakIndex; 570 if (nStartPos >= 0 && Text[nStartPos--] == WJ) 571 GlueSpace=true; 572 while (nStartPos >= 0 && 573 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) { 574 if (Text[nStartPos--] == WJ) 575 GlueSpace=true; 576 } 577 if (GlueSpace && nStartPos < 0) { 578 lbr.breakIndex = 0; 579 break; 580 } 581 } 582 } 583 584 return lbr; 585 } 586 587 OUString SAL_CALL 588 BreakIterator_Unicode::getImplementationName() 589 { 590 return OUString::createFromAscii(cBreakIterator); 591 } 592 593 sal_Bool SAL_CALL 594 BreakIterator_Unicode::supportsService(const OUString& rServiceName) 595 { 596 return cppu::supportsService(this, rServiceName); 597 } 598 599 uno::Sequence< OUString > SAL_CALL 600 BreakIterator_Unicode::getSupportedServiceNames() 601 { 602 uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) }; 603 return aRet; 604 } 605 606 } 607 608 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * 609 com_sun_star_i18n_BreakIterator_Unicode_get_implementation( 610 css::uno::XComponentContext *, 611 css::uno::Sequence<css::uno::Any> const &) 612 { 613 return cppu::acquire(new i18npool::BreakIterator_Unicode()); 614 } 615 616 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 617
