1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
31 #include <string.h>
32 
33 U_CDECL_BEGIN
34 extern const char OpenOffice_dat[];
35 U_CDECL_END
36 
37 using namespace ::com::sun::star;
38 using namespace ::com::sun::star::i18n;
39 using namespace ::com::sun::star::lang;
40 
41 namespace i18npool {
42 
// Cache of break iterators, looked up by the key strings that
// loadICUBreakIterator() builds. The cached values carry state (the text
// currently attached to each iterator), so the map has to be thread_local.
thread_local static BreakIterator_Unicode::BIMap theBIMap;
46 
47 BreakIterator_Unicode::BreakIterator_Unicode()
48     : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )    // implementation name
49     , lineRule( "line" )
50     , icuBI( nullptr )
51 {
52 }
53 
54 BreakIterator_Unicode::~BreakIterator_Unicode()
55 {
56 }
57 
58 /*
59     Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
60     setbreakType method.
61 */
62 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
63 {
64     public:
65 #if (U_ICU_VERSION_MAJOR_NUM < 58)
66     // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
67     void publicSetBreakType(int32_t type)
68         {
69             setBreakType(type);
70         };
71 #endif
72     OOoRuleBasedBreakIterator(UDataMemory* image,
73                               UErrorCode &status)
74         : icu::RuleBasedBreakIterator(image, status)
75         { };
76 
77 };
78 
79 // loading ICU breakiterator on demand.
80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
81         sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText)
82 {
83     bool bNewBreak = false;
84     UErrorCode status = U_ZERO_ERROR;
85     sal_Int16 breakType = 0;
86     switch (rBreakType) {
87         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
88         case LOAD_WORD_BREAKITERATOR:
89             assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
90             icuBI=&words[nWordType];
91             switch (nWordType) {
92                 case WordType::ANY_WORD: break; // odd but previous behavior
93                 case WordType::ANYWORD_IGNOREWHITESPACES:
94                     breakType = 0; rule = "edit_word"; break;
95                 case WordType::DICTIONARY_WORD:
96                     breakType = 1; rule = "dict_word"; break;
97                 default:
98                 case WordType::WORD_COUNT:
99                     breakType = 2; rule = "count_word"; break;
100             }
101             break;
102         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
103         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
104     }
105 
106     // Using the cache map prevents accessing the file system for each
107     // udata_open() where ICU tries first files then data objects. And that for
108     // two fallbacks worst case.. for each new allocated EditEngine, layout
109     // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.
110     // This also speeds up loading iterators for alternating or generally more
111     // than one language/locale in that iterators are not constructed and
112     // destroyed en masse.
113     // Four possible keys, locale rule based with break type, locale rule based
114     // only, rule based only, locale based with break type. A fifth global key
115     // for the initial lookup.
116     // Multiple global keys may map to identical value data.
117     // All enums used here should be in the range 0..9 so assert that and avoid
118     // expensive numeric conversion in append() for faster construction of the
119     // always used global key.
120     assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
121     const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
122     OStringBuffer aKeyBuf(64);
123     aKeyBuf.append( aLangtagStr).append(';');
124     if (rule)
125         aKeyBuf.append(rule);
126     aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';').
127         append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType));
128     // langtag;rule;breakType;rBreakType;nWordType
129     const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
130 
131     if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
132     {
133 
134         auto aMapIt( theBIMap.find( aBIMapGlobalKey));
135         bool bInMap = (aMapIt != theBIMap.end());
136         if (bInMap)
137             icuBI->mpValue = aMapIt->second;
138         else
139             icuBI->mpValue.reset();
140 
141         if (!bInMap && rule) do {
142             uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
143 
144             status = U_ZERO_ERROR;
145             udata_setAppData("OpenOffice", OpenOffice_dat, &status);
146             if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
147 
148             std::unique_ptr<OOoRuleBasedBreakIterator> rbi;
149 
150             if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
151             {
152                 // langtag;rule;breakType
153                 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
154                 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
155                 bInMap = (aMapIt != theBIMap.end());
156                 if (bInMap)
157                 {
158                     icuBI->mpValue = aMapIt->second;
159                     icuBI->maBIMapKey = aBIMapGlobalKey;
160                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
161                     break;  // do
162                 }
163 
164                 rbi.reset(new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
165                     OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status));
166 
167                 if (U_SUCCESS(status))
168                 {
169                     icuBI->mpValue.reset( new BI_ValueData);
170                     icuBI->mpValue->mpBreakIterator = std::move( rbi);
171                     theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
172                 }
173                 else
174                 {
175                     rbi.reset();
176                 }
177             }
178             //use icu's breakiterator for Thai, Tibetan and Dzongkha
179             else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
180             {
181                 // language;rule (not langtag, unless we'd actually load such)
182                 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
183                 const OString aBIMapRuleKey( aLanguage + ";" + rule);
184                 aMapIt = theBIMap.find( aBIMapRuleKey);
185                 bInMap = (aMapIt != theBIMap.end());
186                 if (bInMap)
187                 {
188                     icuBI->mpValue = aMapIt->second;
189                     icuBI->maBIMapKey = aBIMapGlobalKey;
190                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
191                     break;  // do
192                 }
193 
194                 status = U_ZERO_ERROR;
195                 OStringBuffer aUDName(64);
196                 aUDName.append(rule);
197                 aUDName.append('_');
198                 aUDName.append( aLanguage);
199                 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
200                 if( U_SUCCESS(status) )
201                     rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
202                 if ( U_SUCCESS(status) )
203                 {
204                     icuBI->mpValue.reset( new BI_ValueData);
205                     icuBI->mpValue->mpBreakIterator = std::move( rbi);
206                     theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
207                 }
208                 else
209                 {
210                     rbi.reset();
211 
212                     // ;rule (only)
213                     const OString aBIMapRuleOnlyKey( OString(";") + rule);
214                     aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
215                     bInMap = (aMapIt != theBIMap.end());
216                     if (bInMap)
217                     {
218                         icuBI->mpValue = aMapIt->second;
219                         icuBI->maBIMapKey = aBIMapGlobalKey;
220                         theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
221                         break;  // do
222                     }
223 
224                     status = U_ZERO_ERROR;
225                     pUData = udata_open("OpenOffice", "brk", rule, &status);
226                     if( U_SUCCESS(status) )
227                         rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
228                     if ( U_SUCCESS(status) )
229                     {
230                         icuBI->mpValue.reset( new BI_ValueData);
231                         icuBI->mpValue->mpBreakIterator = std::move( rbi);
232                         theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
233                     }
234                     else
235                     {
236                         rbi.reset();
237                     }
238                 }
239             }
240             if (rbi) {
241 #if (U_ICU_VERSION_MAJOR_NUM < 58)
242                 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
243                 // instead of protected, so the old workaround of
244                 // https://ssl.icu-project.org/trac/ticket/5498
245                 // doesn't work anymore. However, they also claim to have fixed
246                 // the cause that an initial fBreakType==-1 would lead to an
247                 // endless loop under some circumstances.
248                 // Let's see ...
249                 switch (rBreakType) {
250                     case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
251                     case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
252                     case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
253                     case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
254                 }
255 #endif
256             }
257         } while (false);
258 
259         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do {
260             // langtag;;;rBreakType (empty rule; empty breakType)
261             const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
262             aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
263             bInMap = (aMapIt != theBIMap.end());
264             if (bInMap)
265             {
266                 icuBI->mpValue = aMapIt->second;
267                 icuBI->maBIMapKey = aBIMapGlobalKey;
268                 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
269                 break;  // do
270             }
271 
272             icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
273             std::shared_ptr< icu::BreakIterator > pBI;
274 
275             status = U_ZERO_ERROR;
276             switch (rBreakType) {
277                 case LOAD_CHARACTER_BREAKITERATOR:
278                     pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
279                     break;
280                 case LOAD_WORD_BREAKITERATOR:
281                     pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
282                     break;
283                 case LOAD_SENTENCE_BREAKITERATOR:
284                     pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
285                     break;
286                 case LOAD_LINE_BREAKITERATOR:
287                     pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
288                     break;
289             }
290             if ( !U_SUCCESS(status) || !pBI ) {
291                 throw uno::RuntimeException();
292             }
293             icuBI->mpValue.reset( new BI_ValueData);
294             icuBI->mpValue->mpBreakIterator = pBI;
295             theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
296         } while (false);
297         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
298             throw uno::RuntimeException();
299         }
300         icuBI->maBIMapKey = aBIMapGlobalKey;
301         if (!bInMap)
302             theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
303         bNewBreak=true;
304     }
305 
306     if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)
307     {
308         const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
309 
310         status = U_ZERO_ERROR;
311         icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
312 
313         if (!U_SUCCESS(status))
314             throw uno::RuntimeException();
315 
316         icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
317 
318         if (!U_SUCCESS(status))
319             throw uno::RuntimeException();
320 
321         icuBI->mpValue->maICUText = rText;
322     }
323 }
324 
325 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
326         sal_Int32 nStartPos, const lang::Locale &rLocale,
327         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
328 {
329     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
330         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
331         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
332         for (nDone = 0; nDone < nCount; nDone++) {
333             nStartPos = pBI->following(nStartPos);
334             if (nStartPos == icu::BreakIterator::DONE)
335                 return Text.getLength();
336         }
337     } else { // for CHARACTER mode
338         for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
339             Text.iterateCodePoints(&nStartPos);
340     }
341     return nStartPos;
342 }
343 
344 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
345         sal_Int32 nStartPos, const lang::Locale& rLocale,
346         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
347 {
348     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
349         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
350         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
351         for (nDone = 0; nDone < nCount; nDone++) {
352             nStartPos = pBI->preceding(nStartPos);
353             if (nStartPos == icu::BreakIterator::DONE)
354                 return 0;
355         }
356     } else { // for BS to delete one char and CHARACTER mode.
357         for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
358             Text.iterateCodePoints(&nStartPos, -1);
359     }
360     return nStartPos;
361 }
362 
363 
364 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
365     const lang::Locale& rLocale, sal_Int16 rWordType )
366 {
367     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
368 
369     Boundary rv;
370     rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
371     if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
372         rv.endPos = result.startPos;
373     else {
374         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
375                     rWordType == WordType::DICTIONARY_WORD ) &&
376                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
377             rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
378 
379         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
380         if(rv.endPos == icu::BreakIterator::DONE)
381             rv.endPos = rv.startPos;
382     }
383     return rv;
384 }
385 
386 
387 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
388         const lang::Locale& rLocale, sal_Int16 rWordType)
389 {
390     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
391 
392     Boundary rv;
393     rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
394     if( rv.startPos < 0)
395         rv.endPos = rv.startPos;
396     else {
397         if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
398                     rWordType == WordType::DICTIONARY_WORD) &&
399                 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
400             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
401 
402         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
403         if(rv.endPos == icu::BreakIterator::DONE)
404             rv.endPos = rv.startPos;
405     }
406     return rv;
407 }
408 
409 
410 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
411         sal_Int16 rWordType, sal_Bool bDirection )
412 {
413     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
414     sal_Int32 len = Text.getLength();
415 
416     Boundary rv;
417     if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
418         rv.startPos = rv.endPos = nPos;
419         if((bDirection || nPos == 0) && nPos < len) //forward
420             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
421         else
422             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
423     } else {
424         if(nPos <= 0) {
425             rv.startPos = 0;
426             rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
427         } else if(nPos >= len) {
428             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
429             rv.endPos = len;
430         } else {
431             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
432             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
433         }
434     }
435     if (rv.startPos == icu::BreakIterator::DONE)
436         rv.startPos = rv.endPos;
437     else if (rv.endPos == icu::BreakIterator::DONE)
438         rv.endPos = rv.startPos;
439 
440     return rv;
441 }
442 
443 
444 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
445         const lang::Locale &rLocale )
446 {
447     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
448 
449     sal_Int32 len = Text.getLength();
450     if (len > 0 && nStartPos == len)
451         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
452     if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
453         nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
454 
455     // skip preceding space.
456     sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
457     while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
458     Text.iterateCodePoints(&nStartPos, -1);
459 
460     return nStartPos;
461 }
462 
463 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
464         const lang::Locale &rLocale )
465 {
466     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
467 
468     sal_Int32 len = Text.getLength();
469     if (len > 0 && nStartPos == len)
470         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
471     nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
472 
473     sal_Int32 nPos=nStartPos;
474     while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
475 
476     return nStartPos;
477 }
478 
479 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
480         const OUString& Text, sal_Int32 nStartPos,
481         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
482         const LineBreakHyphenationOptions& hOptions,
483         const LineBreakUserOptions& /*rOptions*/ )
484 {
485     LineBreakResults lbr;
486 
487     if (nStartPos >= Text.getLength()) {
488         lbr.breakIndex = Text.getLength();
489         lbr.breakType = BreakType::WORDBOUNDARY;
490         return lbr;
491     }
492 
493     loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
494 
495     icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
496     bool GlueSpace=true;
497     while (GlueSpace) {
498         if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
499             lbr.breakIndex = nStartPos;
500             lbr.breakType = BreakType::WORDBOUNDARY;
501         } else if (hOptions.rHyphenator.is()) { //Hyphenation break
502             sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
503             pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
504 
505             sal_Int32 nStartPosWordEnd = nStartPos;
506             while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
507                 nStartPosWordEnd --;
508 
509             Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
510                 WordType::DICTIONARY_WORD, false);
511 
512             nStartPosWordEnd = wBoundary.endPos;
513             while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
514                 nStartPosWordEnd ++;
515             nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
516             if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
517 #define SPACE 0x0020
518             while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
519             uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
520                         wBoundary.endPos - wBoundary.startPos), rLocale,
521                     static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
522             if (aHyphenatedWord.is()) {
523                 lbr.rHyphenatedWord = aHyphenatedWord;
524                 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
525                     lbr.breakIndex = -1;
526                 else
527                     lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
528                 lbr.breakType = BreakType::HYPHENATION;
529 
530                 // check not optimal hyphenation of "word-word" (word with hyphens)
531                 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
532                     lbr.breakIndex = pLineBI->current();
533                     lbr.breakType = BreakType::WORDBOUNDARY;
534                 }
535 
536             } else {
537                 lbr.breakIndex = pLineBI->preceding(nStartPos);
538                 lbr.breakType = BreakType::WORDBOUNDARY;
539             }
540         } else { //word boundary break
541             lbr.breakIndex = pLineBI->preceding(nStartPos);
542             lbr.breakType = BreakType::WORDBOUNDARY;
543 
544             // Special case for Slash U+002F SOLIDUS in URI and path names.
545             // TR14 defines that as SY: Symbols Allowing Break After (A).
546             // This is unwanted in paths, see also i#17155
547             if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
548             {
549                 // Look backward and take any whitespace before as a break
550                 // opportunity. This also glues something like "w/o".
551                 // Avoid an overly long path and break it as was indicated.
552                 // Overly long here is arbitrarily defined.
553                 const sal_Int32 nOverlyLong = 66;
554                 sal_Int32 nPos = lbr.breakIndex - 1;
555                 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
556                 {
557                     if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
558                     {
559                         lbr.breakIndex = nPos + 1;
560                         break;
561                     }
562                 }
563             }
564         }
565 
566 #define WJ 0x2060   // Word Joiner
567         GlueSpace=false;
568         if (lbr.breakType == BreakType::WORDBOUNDARY) {
569             nStartPos = lbr.breakIndex;
570             if (nStartPos >= 0 && Text[nStartPos--] == WJ)
571                 GlueSpace=true;
572             while (nStartPos >= 0 &&
573                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
574                 if (Text[nStartPos--] == WJ)
575                     GlueSpace=true;
576             }
577             if (GlueSpace && nStartPos < 0)  {
578                 lbr.breakIndex = 0;
579                 break;
580             }
581         }
582     }
583 
584     return lbr;
585 }
586 
587 OUString SAL_CALL
588 BreakIterator_Unicode::getImplementationName()
589 {
590     return OUString::createFromAscii(cBreakIterator);
591 }
592 
593 sal_Bool SAL_CALL
594 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
595 {
596     return cppu::supportsService(this, rServiceName);
597 }
598 
599 uno::Sequence< OUString > SAL_CALL
600 BreakIterator_Unicode::getSupportedServiceNames()
601 {
602     uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
603     return aRet;
604 }
605 
606 }
607 
608 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
609 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
610     css::uno::XComponentContext *,
611     css::uno::Sequence<css::uno::Any> const &)
612 {
613     return cppu::acquire(new i18npool::BreakIterator_Unicode());
614 }
615 
616 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
617