xref: /core/i18npool/source/breakiterator/breakiterator_unicode.cxx (revision 9a14a0fd8b4227b5d08b3154cddca46f82ec2a03)
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
31 
32 #include <com/sun/star/i18n/BreakType.hpp>
33 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 
36 U_CDECL_BEGIN
37 extern const char OpenOffice_dat[];
38 U_CDECL_END
39 
40 using namespace ::com::sun::star;
41 using namespace ::com::sun::star::i18n;
42 using namespace ::com::sun::star::lang;
43 
44 namespace i18npool {
45 
46 // Cache map of breakiterators, stores state information so has to be
47 // thread_local.
48 thread_local static BreakIterator_Unicode::BIMap theBIMap;
49 
BreakIterator_Unicode()50 BreakIterator_Unicode::BreakIterator_Unicode()
51     : cBreakIterator( u"com.sun.star.i18n.BreakIterator_Unicode"_ustr )    // implementation name
52     , lineRule( "line" )
53     , icuBI( nullptr )
54 {
55 }
56 
~BreakIterator_Unicode()57 BreakIterator_Unicode::~BreakIterator_Unicode()
58 {
59 }
60 
61 namespace {
62 
63 /*
64     Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
65     setbreakType method.
66 */
67 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
68 {
69     public:
OOoRuleBasedBreakIterator(UDataMemory * image,UErrorCode & status)70     OOoRuleBasedBreakIterator(UDataMemory* image,
71                               UErrorCode &status)
72         : icu::RuleBasedBreakIterator(image, status)
73         { };
74 
75 };
76 
77 }
78 
79 // loading ICU breakiterator on demand.
loadICUBreakIterator(const css::lang::Locale & rLocale,sal_Int16 rBreakType,sal_Int16 nWordType,const char * rule,const OUString & rText)80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
81         sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
82 {
83     bool bNewBreak = false;
84     UErrorCode status = U_ZERO_ERROR;
85     sal_Int16 breakType = 0;
86     switch (rBreakType) {
87         case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
88         case LOAD_WORD_BREAKITERATOR:
89             assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
90             icuBI=&words[nWordType];
91             switch (nWordType) {
92                 case WordType::ANY_WORD: break; // odd but previous behavior
93                 case WordType::ANYWORD_IGNOREWHITESPACES:
94                     breakType = 0; rule = "edit_word"; break;
95                 case WordType::DICTIONARY_WORD:
96                     breakType = 1; rule = "dict_word"; break;
97                 default:
98                 case WordType::WORD_COUNT:
99                     breakType = 2; rule = "count_word"; break;
100             }
101             break;
102         case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
103         case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
104     }
105 
106     // Using the cache map prevents accessing the file system for each
107     // udata_open() where ICU tries first files then data objects. And that for
108     // two fallbacks worst case... for each new allocated EditEngine, layout
109     // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.
110     // This also speeds up loading iterators for alternating or generally more
111     // than one language/locale in that iterators are not constructed and
112     // destroyed en masse.
113     // Four possible keys, locale rule based with break type, locale rule based
114     // only, rule based only, locale based with break type. A fifth global key
115     // for the initial lookup.
116     // Multiple global keys may map to identical value data.
117     // All enums used here should be in the range 0..9 so assert that and avoid
118     // expensive numeric conversion in append() for faster construction of the
119     // always used global key.
120     assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
121     const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
122     OStringBuffer aKeyBuf(64);
123     aKeyBuf.append( aLangtagStr + ";" );
124     if (rule)
125         aKeyBuf.append(rule);
126     aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";"
127         + OStringChar(static_cast<char>('0'+rBreakType)) + ";"
128         + OStringChar( static_cast<char>('0'+nWordType)));
129     // langtag;rule;breakType;rBreakType;nWordType
130     const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
131 
132     if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
133     {
134 
135         auto aMapIt( theBIMap.find( aBIMapGlobalKey));
136         bool bInMap = (aMapIt != theBIMap.end());
137         if (bInMap)
138             icuBI->mpValue = aMapIt->second;
139         else
140             icuBI->mpValue.reset();
141 
142         if (!bInMap && rule)
143             do
144             {
145                 const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
146 
147                 status = U_ZERO_ERROR;
148                 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
149                 if ( !U_SUCCESS(status) )
150                     throw uno::RuntimeException("udata_setAppData returned error " + OUString::createFromAscii(u_errorName(status)));
151 
152                 std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
153 
154                 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
155                 {
156                     // langtag;rule;breakType
157                     const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
158                     aMapIt = theBIMap.find( aBIMapRuleTypeKey);
159                     bInMap = (aMapIt != theBIMap.end());
160                     if (bInMap)
161                     {
162                         icuBI->mpValue = aMapIt->second;
163                         icuBI->maBIMapKey = aBIMapGlobalKey;
164                         theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
165                         break;  // do
166                     }
167 
168                     rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
169                         OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
170 
171                     if (U_SUCCESS(status))
172                     {
173                         icuBI->mpValue = std::make_shared<BI_ValueData>();
174                         icuBI->mpValue->mpBreakIterator = rbi;
175                         theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
176                     }
177                     else
178                     {
179                         rbi.reset();
180                     }
181                 }
182                 else
183                 {
184                     // language;rule (not langtag, unless we'd actually load such)
185                     OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
186                     const OString aBIMapRuleKey( aLanguage + ";" + rule);
187                     aMapIt = theBIMap.find( aBIMapRuleKey);
188                     bInMap = (aMapIt != theBIMap.end());
189                     if (bInMap)
190                     {
191                         icuBI->mpValue = aMapIt->second;
192                         icuBI->maBIMapKey = aBIMapGlobalKey;
193                         theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
194                         break;  // do
195                     }
196 
197                     status = U_ZERO_ERROR;
198                     OString aUDName = OString::Concat(rule) + "_" + aLanguage;
199                     UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
200                     if( U_SUCCESS(status) )
201                         rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
202                     if ( U_SUCCESS(status) )
203                     {
204                         icuBI->mpValue = std::make_shared<BI_ValueData>();
205                         icuBI->mpValue->mpBreakIterator = rbi;
206                         theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
207                     }
208                     else
209                     {
210                         rbi.reset();
211 
212                         // ;rule (only)
213                         const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
214                         aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
215                         bInMap = (aMapIt != theBIMap.end());
216                         if (bInMap)
217                         {
218                             icuBI->mpValue = aMapIt->second;
219                             icuBI->maBIMapKey = aBIMapGlobalKey;
220                             theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
221                             break;  // do
222                         }
223 
224                         status = U_ZERO_ERROR;
225                         pUData = udata_open("OpenOffice", "brk", rule, &status);
226                         if( U_SUCCESS(status) )
227                             rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
228                         if ( U_SUCCESS(status) )
229                         {
230                             icuBI->mpValue = std::make_shared<BI_ValueData>();
231                             icuBI->mpValue->mpBreakIterator = rbi;
232                             theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
233                         }
234                         else
235                         {
236                             rbi.reset();
237                         }
238                     }
239                 }
240             } while (false);
241 
242         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
243             do
244             {
245                 // langtag;;;rBreakType (empty rule; empty breakType)
246                 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
247                 aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
248                 bInMap = (aMapIt != theBIMap.end());
249                 if (bInMap)
250                 {
251                     icuBI->mpValue = aMapIt->second;
252                     icuBI->maBIMapKey = aBIMapGlobalKey;
253                     theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
254                     break;  // do
255                 }
256 
257                 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
258                 std::shared_ptr< icu::BreakIterator > pBI;
259 
260                 status = U_ZERO_ERROR;
261                 switch (rBreakType) {
262                     case LOAD_CHARACTER_BREAKITERATOR:
263                         pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
264                         break;
265                     case LOAD_WORD_BREAKITERATOR:
266                         pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
267                         break;
268                     case LOAD_SENTENCE_BREAKITERATOR:
269                         pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
270                         break;
271                     case LOAD_LINE_BREAKITERATOR:
272                         pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
273                         break;
274                 }
275                 if ( !U_SUCCESS(status) || !pBI ) {
276                     throw uno::RuntimeException("Failed to create ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
277                 }
278                 icuBI->mpValue = std::make_shared<BI_ValueData>();
279                 icuBI->mpValue->mpBreakIterator = std::move(pBI);
280                 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
281             } while (false);
282         if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
283             throw uno::RuntimeException(u"ICU BreakIterator is not properly initialized"_ustr);
284         }
285         icuBI->maBIMapKey = aBIMapGlobalKey;
286         if (!bInMap)
287             theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
288         bNewBreak=true;
289     }
290 
291     if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
292         return;
293 
294     const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
295 
296     status = U_ZERO_ERROR;
297     icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
298 
299     if (!U_SUCCESS(status))
300         throw uno::RuntimeException("utext_openUChars returned error " + OUString::createFromAscii(u_errorName(status)));
301 
302     icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
303 
304     if (!U_SUCCESS(status))
305         throw uno::RuntimeException("Failed to set text for ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
306 
307     icuBI->mpValue->maICUText = rText;
308 }
309 
nextCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)310 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
311         sal_Int32 nStartPos, const lang::Locale &rLocale,
312         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
313 {
314     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
315         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
316         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
317         for (nDone = 0; nDone < nCount; nDone++) {
318             nStartPos = pBI->following(nStartPos);
319             if (nStartPos == icu::BreakIterator::DONE)
320                 return Text.getLength();
321         }
322     } else { // for CHARACTER mode
323         for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
324             Text.iterateCodePoints(&nStartPos);
325     }
326     return nStartPos;
327 }
328 
previousCharacters(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 nCharacterIteratorMode,sal_Int32 nCount,sal_Int32 & nDone)329 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
330         sal_Int32 nStartPos, const lang::Locale& rLocale,
331         sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
332 {
333     if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
334         loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
335         icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
336         for (nDone = 0; nDone < nCount; nDone++) {
337             nStartPos = pBI->preceding(nStartPos);
338             if (nStartPos == icu::BreakIterator::DONE)
339                 return 0;
340         }
341     } else { // for BS to delete one char and CHARACTER mode.
342         for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
343             Text.iterateCodePoints(&nStartPos, -1);
344     }
345     return nStartPos;
346 }
347 
348 
nextWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)349 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
350     const lang::Locale& rLocale, sal_Int16 rWordType )
351 {
352     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
353 
354     Boundary rv;
355     rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
356     if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
357         rv.endPos = rv.startPos;
358     else {
359         if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
360              && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
361             || (rWordType == WordType::DICTIONARY_WORD
362                 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
363             rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
364 
365         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
366         if(rv.endPos == icu::BreakIterator::DONE)
367             rv.endPos = rv.startPos;
368     }
369     return rv;
370 }
371 
372 
previousWord(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int16 rWordType)373 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
374         const lang::Locale& rLocale, sal_Int16 rWordType)
375 {
376     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
377 
378     Boundary rv;
379     rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
380     if( rv.startPos < 0)
381         rv.endPos = rv.startPos;
382     else {
383 
384         if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
385              && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
386             || (rWordType == WordType::DICTIONARY_WORD
387                 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
388             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
389 
390         rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
391         if(rv.endPos == icu::BreakIterator::DONE)
392             rv.endPos = rv.startPos;
393     }
394     return rv;
395 }
396 
397 
getWordBoundary(const OUString & Text,sal_Int32 nPos,const lang::Locale & rLocale,sal_Int16 rWordType,sal_Bool bDirection)398 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
399         sal_Int16 rWordType, sal_Bool bDirection )
400 {
401     loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
402     sal_Int32 len = Text.getLength();
403 
404     Boundary rv;
405     if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
406         rv.startPos = rv.endPos = nPos;
407         if((bDirection || nPos == 0) && nPos < len) //forward
408             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
409         else
410             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
411     } else {
412         if(nPos <= 0) {
413             rv.startPos = 0;
414             rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
415         } else if(nPos >= len) {
416             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
417             rv.endPos = len;
418         } else {
419             rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
420             rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
421         }
422     }
423     if (rv.startPos == icu::BreakIterator::DONE)
424         rv.startPos = rv.endPos;
425     else if (rv.endPos == icu::BreakIterator::DONE)
426         rv.endPos = rv.startPos;
427 
428     return rv;
429 }
430 
431 
beginOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)432 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
433         const lang::Locale &rLocale )
434 {
435     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
436 
437     sal_Int32 len = Text.getLength();
438     if (len > 0 && nStartPos == len)
439         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
440     if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
441         nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
442 
443     // skip preceding space.
444     sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
445     while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
446     Text.iterateCodePoints(&nStartPos, -1);
447 
448     return nStartPos;
449 }
450 
endOfSentence(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale)451 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
452         const lang::Locale &rLocale )
453 {
454     loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
455 
456     sal_Int32 len = Text.getLength();
457     if (len > 0 && nStartPos == len)
458         Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
459     nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
460 
461     sal_Int32 nPos=nStartPos;
462     while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
463 
464     return nStartPos;
465 }
466 
getLineBreak(const OUString & Text,sal_Int32 nStartPos,const lang::Locale & rLocale,sal_Int32 nMinBreakPos,const LineBreakHyphenationOptions & hOptions,const LineBreakUserOptions &)467 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
468         const OUString& Text, sal_Int32 nStartPos,
469         const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
470         const LineBreakHyphenationOptions& hOptions,
471         const LineBreakUserOptions& /*rOptions*/ )
472 {
473     LineBreakResults lbr;
474 
475     if (nStartPos >= Text.getLength()) {
476         lbr.breakIndex = Text.getLength();
477         lbr.breakType = BreakType::WORDBOUNDARY;
478         return lbr;
479     }
480 
481     loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
482 
483     icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
484     bool GlueSpace=true;
485     while (GlueSpace) {
486         // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
487         if (pLineBI->preceding(nStartPos + 1) == nStartPos
488                 && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
489         { //Line boundary break
490             lbr.breakIndex = nStartPos;
491             lbr.breakType = BreakType::WORDBOUNDARY;
492         } else if (hOptions.rHyphenator.is()) { //Hyphenation break
493             sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
494             pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
495 
496             sal_Int32 nStartPosWordEnd = nStartPos;
497             while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
498                 nStartPosWordEnd --;
499 
500             Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
501                 WordType::DICTIONARY_WORD, false);
502 
503             nStartPosWordEnd = wBoundary.endPos;
504             while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
505                 nStartPosWordEnd ++;
506             nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
507             if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
508 #define SPACE 0x0020
509             while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
510             uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
511                         wBoundary.endPos - wBoundary.startPos), rLocale,
512                     static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
513             if (aHyphenatedWord.is()) {
514                 lbr.rHyphenatedWord = aHyphenatedWord;
515                 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
516                     lbr.breakIndex = -1;
517                 else
518                     lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
519                 lbr.breakType = BreakType::HYPHENATION;
520 
521                 // check not optimal hyphenation of "word-word" (word with hyphens)
522                 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
523                     lbr.breakIndex = pLineBI->current();
524                     lbr.breakType = BreakType::WORDBOUNDARY;
525                 }
526 
527             } else {
528                 lbr.breakIndex = pLineBI->preceding(nStartPos);
529                 lbr.breakType = BreakType::WORDBOUNDARY;
530             }
531         } else { //word boundary break
532             lbr.breakIndex = pLineBI->preceding(nStartPos);
533             lbr.breakType = BreakType::WORDBOUNDARY;
534 
535             // Special case for Slash U+002F SOLIDUS in URI and path names.
536             // TR14 defines that as SY: Symbols Allowing Break After (A).
537             // This is unwanted in paths, see also i#17155
538             if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
539             {
540                 // Look backward and take any whitespace before as a break
541                 // opportunity. This also glues something like "w/o".
542                 // Avoid an overly long path and break it as was indicated.
543                 // Overly long here is arbitrarily defined.
544                 const sal_Int32 nOverlyLong = 66;
545                 sal_Int32 nPos = lbr.breakIndex - 1;
546                 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
547                 {
548                     if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
549                     {
550                         lbr.breakIndex = nPos + 1;
551                         break;
552                     }
553                 }
554             }
555         }
556 
557 #define WJ 0x2060   // Word Joiner
558         GlueSpace=false;
559         if (lbr.breakType == BreakType::WORDBOUNDARY) {
560             nStartPos = lbr.breakIndex;
561             if (nStartPos >= 0 && Text[nStartPos--] == WJ)
562                 GlueSpace=true;
563             while (nStartPos >= 0 &&
564                     (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
565                 if (Text[nStartPos--] == WJ)
566                     GlueSpace=true;
567             }
568             if (GlueSpace && nStartPos < 0)  {
569                 lbr.breakIndex = 0;
570                 break;
571             }
572         }
573     }
574 
575     return lbr;
576 }
577 
578 OUString SAL_CALL
getImplementationName()579 BreakIterator_Unicode::getImplementationName()
580 {
581     return cBreakIterator;
582 }
583 
584 sal_Bool SAL_CALL
supportsService(const OUString & rServiceName)585 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
586 {
587     return cppu::supportsService(this, rServiceName);
588 }
589 
590 uno::Sequence< OUString > SAL_CALL
getSupportedServiceNames()591 BreakIterator_Unicode::getSupportedServiceNames()
592 {
593     return{ cBreakIterator };
594 }
595 
596 }
597 
598 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *,css::uno::Sequence<css::uno::Any> const &)599 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
600     css::uno::XComponentContext *,
601     css::uno::Sequence<css::uno::Any> const &)
602 {
603     return cppu::acquire(new i18npool::BreakIterator_Unicode());
604 }
605 
606 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
607