1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <com/sun/star/uno/Reference.h>
21 
22 #include <comphelper/sequence.hxx>
23 #include <comphelper/processfactory.hxx>
24 #include <cppuhelper/factory.hxx>
25 #include <cppuhelper/supportsservice.hxx>
26 #include <cppuhelper/weak.hxx>
27 #include <com/sun/star/linguistic2/XLinguProperties.hpp>
28 #include <com/sun/star/linguistic2/LinguServiceManager.hpp>
29 #include <com/sun/star/linguistic2/XSpellChecker1.hpp>
30 #include <i18nlangtag/languagetag.hxx>
31 #include <tools/debug.hxx>
32 #include <osl/mutex.hxx>
33 #include <osl/thread.h>
34 
35 #include <hyphen.h>
36 #include "hyphenimp.hxx"
37 
38 #include <linguistic/hyphdta.hxx>
39 #include <rtl/ustring.hxx>
40 #include <rtl/ustrbuf.hxx>
41 #include <rtl/textenc.h>
42 #include <sal/log.hxx>
43 
44 #include <linguistic/misc.hxx>
45 #include <svtools/strings.hrc>
46 #include <unotools/charclass.hxx>
47 #include <unotools/lingucfg.hxx>
48 #include <unotools/resmgr.hxx>
49 #include <osl/file.hxx>
50 
51 #include <stdio.h>
52 #include <string.h>
53 
54 #include <cassert>
55 #include <numeric>
56 #include <vector>
57 #include <set>
58 #include <memory>
59 #include <o3tl/string_view.hxx>
60 
// XML-header to query SPELLML support: this string is passed as the "word"
// to XSpellChecker1::isValid() in Hyphenator::hyphenate() to find out whether
// the spell checker accepts "<?xml?>"-prefixed queries (used there to request
// a morphological analysis of the word).
constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";
63 
64 using namespace osl;
65 using namespace com::sun::star;
66 using namespace com::sun::star::beans;
67 using namespace com::sun::star::lang;
68 using namespace com::sun::star::uno;
69 using namespace com::sun::star::linguistic2;
70 using namespace linguistic;
71 
GetLngSvcMgr_Impl()72 static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
73 {
74     uno::Reference< XComponentContext > xContext( comphelper::getProcessComponentContext() );
75     uno::Reference< XLinguServiceManager2 > xRes = LinguServiceManager::create( xContext ) ;
76     return xRes;
77 }
78 
Hyphenator()79 Hyphenator::Hyphenator() :
80     aEvtListeners   ( GetLinguMutex() )
81 {
82     bDisposing = false;
83 }
84 
~Hyphenator()85 Hyphenator::~Hyphenator()
86 {
87     for (auto & rInfo : mvDicts)
88     {
89         if (rInfo.aPtr)
90             hnj_hyphen_free(rInfo.aPtr);
91     }
92 
93     if (pPropHelper)
94     {
95         pPropHelper->RemoveAsPropListener();
96     }
97 }
98 
GetPropHelper_Impl()99 PropertyHelper_Hyphenation& Hyphenator::GetPropHelper_Impl()
100 {
101     if (!pPropHelper)
102     {
103         Reference< XLinguProperties >   xPropSet = GetLinguProperties();
104 
105         pPropHelper.reset( new PropertyHelper_Hyphenation (static_cast<XHyphenator *>(this), xPropSet ) );
106         pPropHelper->AddAsPropListener();   //! after a reference is established
107     }
108     return *pPropHelper;
109 }
110 
getLocales()111 Sequence< Locale > SAL_CALL Hyphenator::getLocales()
112 {
113     MutexGuard  aGuard( GetLinguMutex() );
114 
115     // this routine should return the locales supported by the installed
116     // dictionaries.
117     if (mvDicts.empty())
118     {
119         SvtLinguConfig aLinguCfg;
120 
121         // get list of dictionaries-to-use
122         // (or better speaking: the list of dictionaries using the
123         // new configuration entries).
124         std::vector< SvtLinguConfigDictionaryEntry > aDics;
125         uno::Sequence< OUString > aFormatList;
126         aLinguCfg.GetSupportedDictionaryFormatsFor( u"Hyphenators"_ustr,
127                 u"org.openoffice.lingu.LibHnjHyphenator"_ustr, aFormatList );
128         for (const auto& rFormat : aFormatList)
129         {
130             std::vector< SvtLinguConfigDictionaryEntry > aTmpDic(
131                     aLinguCfg.GetActiveDictionariesByFormat( rFormat ) );
132             aDics.insert( aDics.end(), aTmpDic.begin(), aTmpDic.end() );
133         }
134 
135         //!! for compatibility with old dictionaries (the ones not using extensions
136         //!! or new configuration entries, but still using the dictionary.lst file)
137         //!! Get the list of old style spell checking dictionaries to use...
138         std::vector< SvtLinguConfigDictionaryEntry > aOldStyleDics(
139                 GetOldStyleDics( "HYPH" ) );
140 
141         // to prefer dictionaries with configuration entries we will only
142         // use those old style dictionaries that add a language that
143         // is not yet supported by the list of new style dictionaries
144         MergeNewStyleDicsAndOldStyleDics( aDics, aOldStyleDics );
145 
146         if (!aDics.empty())
147         {
148             // get supported locales from the dictionaries-to-use...
149             std::set<OUString> aLocaleNamesSet;
150             for (auto const& dict : aDics)
151             {
152                 for (const auto& rLocaleName : dict.aLocaleNames)
153                 {
154                     aLocaleNamesSet.insert( rLocaleName );
155                 }
156             }
157             // ... and add them to the resulting sequence
158             std::vector<Locale> aLocalesVec;
159             aLocalesVec.reserve(aLocaleNamesSet.size());
160 
161             std::transform(aLocaleNamesSet.begin(), aLocaleNamesSet.end(), std::back_inserter(aLocalesVec),
162                 [](const OUString& localeName) { return LanguageTag::convertToLocale(localeName); });
163 
164             aSuppLocales = comphelper::containerToSequence(aLocalesVec);
165 
166             //! For each dictionary and each locale we need a separate entry.
167             //! If this results in more than one dictionary per locale than (for now)
168             //! it is undefined which dictionary gets used.
169             //! In the future the implementation should support using several dictionaries
170             //! for one locale.
171             sal_Int32 numdict = std::accumulate(aDics.begin(), aDics.end(), 0,
172                 [](const sal_Int32 nSum, const SvtLinguConfigDictionaryEntry& dict) {
173                     return nSum + dict.aLocaleNames.getLength(); });
174 
175             // add dictionary information
176             mvDicts.resize(numdict);
177 
178             sal_Int32 k = 0;
179             for (auto const& dict :  aDics)
180             {
181                 if (dict.aLocaleNames.hasElements() &&
182                     dict.aLocations.hasElements())
183                 {
184                     // currently only one language per dictionary is supported in the actual implementation...
185                     // Thus here we work-around this by adding the same dictionary several times.
186                     // Once for each of its supported locales.
187                     for (const auto& rLocaleName : dict.aLocaleNames)
188                     {
189                         LanguageTag aLanguageTag(rLocaleName);
190                         mvDicts[k].aPtr = nullptr;
191                         mvDicts[k].eEnc = RTL_TEXTENCODING_DONTKNOW;
192                         mvDicts[k].aLoc = aLanguageTag.getLocale();
193                         mvDicts[k].apCC.reset( new CharClass( std::move(aLanguageTag) ) );
194                         // also both files have to be in the same directory and the
195                         // file names must only differ in the extension (.aff/.dic).
196                         // Thus we use the first location only and strip the extension part.
197                         OUString aLocation = dict.aLocations[0];
198                         sal_Int32 nPos = aLocation.lastIndexOf( '.' );
199                         aLocation = aLocation.copy( 0, nPos );
200                         mvDicts[k].aName = aLocation;
201 
202                         ++k;
203                     }
204                 }
205             }
206             DBG_ASSERT( k == numdict, "index mismatch?" );
207         }
208         else
209         {
210             // no dictionary found so register no dictionaries
211             mvDicts.clear();
212             aSuppLocales.realloc(0);
213         }
214     }
215 
216     return aSuppLocales;
217 }
218 
hasLocale(const Locale & rLocale)219 sal_Bool SAL_CALL Hyphenator::hasLocale(const Locale& rLocale)
220 {
221     MutexGuard  aGuard( GetLinguMutex() );
222 
223     if (!aSuppLocales.hasElements())
224         getLocales();
225 
226     return comphelper::findValue(aSuppLocales, rLocale) != -1;
227 }
228 
229 namespace {
LoadDictionary(HDInfo & rDict)230 bool LoadDictionary(HDInfo& rDict)
231 {
232     OUString DictFN = rDict.aName + ".dic";
233     OUString dictpath;
234 
235     osl::FileBase::getSystemPathFromFileURL(DictFN, dictpath);
236 
237 #if defined(_WIN32)
238     // hnj_hyphen_load expects UTF-8 encoded paths with \\?\ long path prefix.
239     OString sTmp = Win_AddLongPathPrefix(OUStringToOString(dictpath, RTL_TEXTENCODING_UTF8));
240 #else
241     OString sTmp(OU2ENC(dictpath, osl_getThreadTextEncoding()));
242 #endif
243     HyphenDict *dict = nullptr;
244     if ((dict = hnj_hyphen_load(sTmp.getStr())) == nullptr)
245     {
246         SAL_WARN(
247             "lingucomponent",
248             "Couldn't find file " << dictpath);
249         return false;
250     }
251     rDict.aPtr = dict;
252     rDict.eEnc = getTextEncodingFromCharset(dict->cset);
253     return true;
254 }
255 }
256 
hyphenate(const OUString & aWord,const css::lang::Locale & aLocale,sal_Int16 nMaxLeading,const css::uno::Sequence<css::beans::PropertyValue> & aProperties)257 Reference< XHyphenatedWord > SAL_CALL Hyphenator::hyphenate( const OUString& aWord,
258        const css::lang::Locale& aLocale,
259        sal_Int16 nMaxLeading,
260        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
261 {
262     PropertyHelper_Hyphenation& rHelper = GetPropHelper();
263     rHelper.SetTmpPropVals(aProperties);
264     sal_Int16 minTrail = rHelper.GetMinTrailing();
265     sal_Int16 minLead = rHelper.GetMinLeading();
266     sal_Int16 minCompoundLead = rHelper.GetCompoundMinLeading();
267     sal_Int16 minLen = rHelper.GetMinWordLength();
268     bool bNoHyphenateCaps = rHelper.IsNoHyphenateCaps();
269 
270     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
271 
272     Reference< XHyphenatedWord > xRes;
273 
274     int k = -1;
275     for (size_t j = 0; j < mvDicts.size(); ++j)
276     {
277         if (aLocale == mvDicts[j].aLoc)
278             k = j;
279     }
280 
281     // if we have a hyphenation dictionary matching this locale
282     if (k != -1)
283     {
284         int nHyphenationPos = -1;
285         int nHyphenationPosAlt = -1;
286         int nHyphenationPosAltHyph = -1;
287 
288         // if this dictionary has not been loaded yet do that
289         if (!mvDicts[k].aPtr)
290         {
291             if (!LoadDictionary(mvDicts[k]))
292                 return nullptr;
293         }
294 
295         // otherwise hyphenate the word with that dictionary
296         HyphenDict *dict = mvDicts[k].aPtr;
297         eEnc = mvDicts[k].eEnc;
298         CharClass * pCC =  mvDicts[k].apCC.get();
299 
300         // Don't hyphenate uppercase words if requested
301         if (bNoHyphenateCaps && aWord == makeUpperCase(aWord, pCC))
302         {
303             return nullptr;
304         }
305 
306         // we don't want to work with a default text encoding since following incorrect
307         // results may occur only for specific text and thus may be hard to notice.
308         // Thus better always make a clean exit here if the text encoding is in question.
309         // Hopefully something not working at all will raise proper attention quickly. ;-)
310         DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
311         if (eEnc == RTL_TEXTENCODING_DONTKNOW)
312             return nullptr;
313 
314         CapType ct = capitalType(aWord, pCC);
315 
316         // first convert any smart quotes or apostrophes to normal ones
317         OUStringBuffer rBuf(aWord);
318         sal_Int32 nc = rBuf.getLength();
319         sal_Unicode ch;
320         for (sal_Int32 ix=0; ix < nc; ix++)
321         {
322             ch = rBuf[ix];
323             if ((ch == 0x201C) || (ch == 0x201D))
324                 rBuf[ix] = u'"';
325             if ((ch == 0x2018) || (ch == 0x2019))
326                 rBuf[ix] = u'\'';
327         }
328         OUString nWord(rBuf.makeStringAndClear());
329 
330         // now convert word to all lowercase for pattern recognition
331         OUString nTerm(makeLowerCase(nWord, pCC));
332 
333         // now convert word to needed encoding
334         OString encWord(OU2ENC(nTerm,eEnc));
335 
336         int wordlen = encWord.getLength();
337         std::unique_ptr<char[]> lcword(new char[wordlen + 1]);
338         std::unique_ptr<char[]> hyphens(new char[wordlen + 5]);
339 
340         char ** rep = nullptr; // replacements of discretionary hyphenation
341         int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
342         int * cut = nullptr; // length of deletions in original word
343 
344         // copy converted word into simple char buffer
345         strcpy(lcword.get(),encWord.getStr());
346 
347         // now strip off any ending periods
348         int n = wordlen-1;
349         while((n >=0) && (lcword[n] == '.'))
350             n--;
351         n++;
352         if (n > 0)
353         {
354             const bool bFailed = 0 != hnj_hyphen_hyphenate3( dict, lcword.get(), n, hyphens.get(), nullptr,
355                     &rep, &pos, &cut, minLead, minTrail,
356                     std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead  - std::max<sal_Int16>(dict->lhmin, 2))),
357                     std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
358             if (bFailed)
359             {
360                 // whoops something did not work
361                 if (rep)
362                 {
363                     for(int j = 0; j < n; j++)
364                     {
365                         if (rep[j]) free(rep[j]);
366                     }
367                     free(rep);
368                 }
369                 if (pos) free(pos);
370                 if (cut) free(cut);
371                 return nullptr;
372             }
373         }
374 
375         // now backfill hyphens[] for any removed trailing periods
376         for (int c = n; c < wordlen; c++) hyphens[c] = '0';
377         hyphens[wordlen] = '\0';
378 
379         sal_Int32 Leading =  GetPosInWordToCheck( aWord, nMaxLeading );
380 
381         // use morphological analysis of Hunspell to get better hyphenation of compound words
382         // optionally when hyphenation zone is enabled
383         // pa: fields contain stems resulted by compound word analysis of non-dictionary words
384         // hy: fields contain hyphenation data of dictionary (compound) words
385         Reference< XSpellAlternatives > xTmpRes;
386         bool bAnalyzed = false; // enough the analyse once the word
387         bool bCompoundHyphenation = true; // try to hyphenate compound words better
388         OUString sStems; // processed result of the compound word analysis, e.g. com|pound|word
389         sal_Int32 nSuffixLen = 0; // do not remove break points in suffixes
390 
391         for (sal_Int32 i = 0; i < n; i++)
392         {
393             int leftrep = 0;
394             bool hit = (n >= minLen);
395             if (!rep || !rep[i])
396             {
397                 hit = hit && (hyphens[i]&1) && (i < Leading);
398                 hit = hit && (i >= (minLead-1) );
399                 hit = hit && ((n - i - 1) >= minTrail);
400             }
401             else
402             {
403                 // calculate change character length before hyphenation point signed with '='
404                 for (char * c = rep[i]; *c && (*c != '='); c++)
405                 {
406                     if (eEnc == RTL_TEXTENCODING_UTF8)
407                     {
408                         if (static_cast<unsigned char>(*c) >> 6 != 2)
409                             leftrep++;
410                     }
411                     else
412                         leftrep++;
413                 }
414                 hit = hit && (hyphens[i]&1) && ((i + leftrep - pos[i]) < Leading);
415                 hit = hit && ((i + leftrep - pos[i]) >= (minLead-1) );
416                 hit = hit && ((n - i - 1 + sal::static_int_cast< sal_sSize >(strlen(rep[i])) - leftrep - 1) >= minTrail);
417             }
418             if (hit)
419             {
420                 // skip hyphenation right after stem boundaries in compound words
421                 // if minCompoundLead > 2 (default value: less than n=minCompoundLead character distance)
422                 if ( bCompoundHyphenation && minCompoundLead > 2 && nHyphenationPos > -1 && i - nHyphenationPos < minCompoundLead )
423                 {
424                     uno::Reference< XLinguServiceManager2 > xLngSvcMgr( GetLngSvcMgr_Impl() );
425                     uno::Reference< XSpellChecker1 > xSpell;
426 
427                     LanguageType nLanguage = LinguLocaleToLanguage( aLocale );
428 
429                     xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );
430 
431                     // get morphological analysis of the word
432                     if ( ( bAnalyzed && xTmpRes.is() ) || ( xSpell.is() && xSpell->isValid(
433                             SPELLML_SUPPORT, static_cast<sal_uInt16>(nLanguage),
434                             uno::Sequence< beans::PropertyValue >() ) ) )
435                     {
436                         if ( !bAnalyzed )
437                         {
438                             xTmpRes = xSpell->spell( "<?xml?><query type='analyze'><word>" +
439                                                        aWord + "</word></query>",
440                                                static_cast<sal_uInt16>(nLanguage),
441                                                uno::Sequence< beans::PropertyValue >() );
442                             bAnalyzed = true;
443 
444                             if (xTmpRes.is())
445                             {
446                                 Sequence<OUString>seq = xTmpRes->getAlternatives();
447                                 if (seq.hasElements())
448                                 {
449                                     sal_Int32 nEndOfFirstAnalysis = seq[0].indexOf("</a>");
450                                     // FIXME use only the first analysis
451                                     OUString morph(
452                                             seq[0].copy(0, nEndOfFirstAnalysis));
453 
454                                     // concatenate pa: fields, i.e. stems in the analysis:
455                                     // pa:stem1 pa:stem2 pa:stem3 -> stem1||stem2||stem3
456                                     sal_Int32 nPa = -1;
457                                     while ( (nPa = morph.indexOf(u" pa:", nPa + 1)) > -1 )
458                                     {
459                                         // use hy: field of the actual stem, if it exists
460                                         // pa:stem1 hy:st|em1 pa:stem2 -> st|em1||stem2
461                                         sal_Int32 nHy = morph.indexOf(u" hy:", nPa + 3);
462                                         sal_Int32 nPa2 = morph.indexOf(u" pa:", nPa + 3);
463 
464                                         if ( nHy > -1 && ( nPa2 == -1 || nHy < nPa2 ) )
465                                         {
466                                             OUString sStems2(morph.getToken(1, ' ', nHy).copy(3));
467                                             if ( sStems2.indexOf('|') > -1 )
468                                                 sStems += sStems2+ u"||";
469                                             else if ( sal_Int32 nBreak = o3tl::toInt32(sStems2) )
470                                             {
471                                                 OUString sPa(morph.getToken(1, ' ', nPa).copy(3));
472                                                 if ( nBreak < sPa.getLength() )
473                                                     sStems += OUString::Concat(sPa.subView(0, nBreak)) + u"|" +
474                                                            sPa.subView(nBreak);
475                                             }
476                                         }
477                                         else
478                                         {
479                                             OUString sPa(morph.getToken(1, ' ', nPa).copy(3));
480 
481                                             // handle special case: missing pa: in morphological analysis
482                                             // before in-word suffixes (German, Sweden etc. dictionaries)
483                                             // (recognized by the single last pa:)
484                                             if (sStems.isEmpty() && nPa2 == -1 && aWord.endsWith(sPa))
485                                             {
486                                                 sStems = OUString::Concat(aWord.subView(0, aWord.getLength() -
487                                                              sPa.getLength())) + u"||" +
488                                                          aWord.subView(aWord.getLength() -
489                                                              sPa.getLength());
490                                                 break;
491                                             }
492 
493                                             sStems += sPa + "||";
494 
495                                             // count suffix length
496                                             sal_Int32 nSt = morph.lastIndexOf(" st:");
497                                             if ( nSt > -1 )
498                                             {
499                                                 sal_Int32 nStemLen =
500                                                     o3tl::getToken(morph, 1, ' ', nSt).length() - 3;
501                                                 if ( nStemLen < sPa.getLength() )
502                                                     nSuffixLen = sPa.getLength() - nStemLen;
503                                             }
504                                         }
505 
506                                         if ( nPa == -1 ) // getToken() can modify nPa
507                                             break;
508                                     }
509 
510                                     // only hy:, but not pa:
511                                     if ( sStems.isEmpty() )
512                                     {
513                                         // check hy: (pre-defined hyphenation)
514                                         sal_Int32 nHy = morph.indexOf(" hy:");
515                                         if (nHy > -1)
516                                         {
517                                             sStems = morph.getToken(1, ' ', nHy).copy(3);
518                                             if ( sStems.indexOf('|') == -1 && sStems.indexOf('-') == -1 )
519                                             {
520                                                 if ( sal_Int32 nBreak = o3tl::toInt32(sStems) )
521                                                 {
522                                                     if ( nBreak < aWord.getLength() )
523                                                         sStems += OUString::Concat(aWord.subView(0, nBreak)) + u"|" +
524                                                                aWord.subView(nBreak);
525                                                 }
526                                             }
527                                         }
528                                     }
529                                 }
530                             }
531                         }
532 
533                         // handle string separated by |, e.g "program hy:pro|gram"
534                         if ( sStems.indexOf('|') > -1 )
535                         {
536                             sal_Int32 nLetters = 0; // count not separator characters
537                             sal_Int32 nSepPos = -1; // position of last character | used for stem boundaries
538                             bool bWeightedSep = false; // double separator || = weighted stem boundary
539                             sal_Int32 j = 0;
540                             for (; j < sStems.getLength() && nLetters <= i; j++)
541                             {
542                                 if ( sStems[j] == '|' )
543                                 {
544                                     bWeightedSep = nSepPos > -1 && (j - 1 == nSepPos);
545                                     nSepPos = j;
546                                 }
547                                 else if ( sStems[j] != '-' && sStems[j] != '=' && sStems[j] != '*' )
548                                     ++nLetters;
549                             }
550                             // skip break points near stem boundaries
551                             if (
552                                 // there is a stem boundary before the actual break point
553                                 nSepPos > -1 &&
554                                 // and the break point is within a stem, i.e. not in the
555                                 // suffix of the last stem
556                                 i < aWord.getLength() - nSuffixLen - 1 &&
557                                 // and it is not another stem boundary
558                                 j + 1 < sStems.getLength() &&
559                                 ( sStems[j + 1] != u'|' ||
560                                 // except if it's only the previous was a weighted one
561                                     ( bWeightedSep && ( j + 2 == sStems.getLength() ||
562                                                         sStems[j + 2] != u'|' ) ) ) )
563                             {
564                                 continue;
565                             }
566                         }
567                         else
568                             // not a compound word
569                             bCompoundHyphenation = false;
570                     }
571                     else
572                         // no SPELLML support, no morphological analysis
573                         bCompoundHyphenation = false;
574                 }
575 
576                 nHyphenationPos = i;
577                 if (rep && rep[i])
578                 {
579                     nHyphenationPosAlt = i - pos[i];
580                     nHyphenationPosAltHyph = i + leftrep - pos[i];
581                 }
582             }
583         }
584 
585         if (nHyphenationPos  == -1)
586         {
587             xRes = nullptr;
588         }
589         else
590         {
591             if (rep && rep[nHyphenationPos])
592             {
593                 // remove equal sign
594                 char * s = rep[nHyphenationPos];
595                 int eq = 0;
596                 for (; *s; s++)
597                 {
598                     if (*s == '=') eq = 1;
599                     if (eq) *s = *(s + 1);
600                 }
601                 OUString repHyphlow(rep[nHyphenationPos], strlen(rep[nHyphenationPos]), eEnc);
602                 OUString repHyph;
603                 switch (ct)
604                 {
605                     case CapType::ALLCAP:
606                     {
607                         repHyph = makeUpperCase(repHyphlow, pCC);
608                         break;
609                     }
610                     case CapType::INITCAP:
611                     {
612                         if (nHyphenationPosAlt == -1)
613                             repHyph = makeInitCap(repHyphlow, pCC);
614                         else
615                              repHyph = repHyphlow;
616                         break;
617                     }
618                     default:
619                     {
620                         repHyph = repHyphlow;
621                         break;
622                     }
623                 }
624 
625                 // handle shortening
626                 sal_Int16 nPos = static_cast<sal_Int16>((nHyphenationPosAltHyph < nHyphenationPos) ?
627                 nHyphenationPosAltHyph : nHyphenationPos);
628                 // discretionary hyphenation
629                 xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ), nPos,
630                     aWord.replaceAt(nHyphenationPosAlt + 1, cut[nHyphenationPos], repHyph),
631                     static_cast<sal_Int16>(nHyphenationPosAltHyph));
632             }
633             else
634             {
635                 xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ),
636                     static_cast<sal_Int16>(nHyphenationPos), aWord, static_cast<sal_Int16>(nHyphenationPos));
637             }
638         }
639 
640         if (rep)
641         {
642             for(int j = 0; j < n; j++)
643             {
644                 if (rep[j]) free(rep[j]);
645             }
646             free(rep);
647         }
648         if (pos) free(pos);
649         if (cut) free(cut);
650         return xRes;
651     }
652     return nullptr;
653 }
654 
queryAlternativeSpelling(const OUString & aWord,const css::lang::Locale & aLocale,sal_Int16 nIndex,const css::uno::Sequence<css::beans::PropertyValue> & aProperties)655 Reference < XHyphenatedWord > SAL_CALL Hyphenator::queryAlternativeSpelling(
656         const OUString& aWord,
657         const css::lang::Locale& aLocale,
658         sal_Int16 nIndex,
659         const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
660 {
661     // Firstly we allow only one plus character before the hyphen to avoid to miss the right break point:
662     for (int extrachar = 1; extrachar <= 2; extrachar++)
663     {
664         Reference< XHyphenatedWord > xRes = hyphenate(aWord, aLocale, nIndex + 1 + extrachar, aProperties);
665         if (xRes.is() && xRes->isAlternativeSpelling() && xRes->getHyphenationPos() == nIndex)
666             return xRes;
667     }
668     return nullptr;
669 }
670 
createPossibleHyphens(const OUString & aWord,const css::lang::Locale & aLocale,const css::uno::Sequence<css::beans::PropertyValue> & aProperties)671 Reference< XPossibleHyphens > SAL_CALL Hyphenator::createPossibleHyphens( const OUString& aWord,
672         const css::lang::Locale& aLocale,
673         const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
674 {
675     PropertyHelper_Hyphenation& rHelper = GetPropHelper();
676     rHelper.SetTmpPropVals(aProperties);
677     sal_Int16 minTrail = rHelper.GetMinTrailing();
678     sal_Int16 minLead = rHelper.GetMinLeading();
679     sal_Int16 minLen = rHelper.GetMinWordLength();
680 
681     // Resolves: fdo#41083 honour MinWordLength in "createPossibleHyphens" as
682     // well as "hyphenate"
683     if (aWord.getLength() < minLen)
684     {
685         return PossibleHyphens::CreatePossibleHyphens( aWord, LinguLocaleToLanguage( aLocale ),
686                       aWord, Sequence< sal_Int16 >() );
687     }
688 
689     int k = -1;
690     for (size_t j = 0; j < mvDicts.size(); ++j)
691     {
692         if (aLocale == mvDicts[j].aLoc)
693             k = j;
694     }
695 
696     // if we have a hyphenation dictionary matching this locale
697     if (k != -1)
698     {
699         HyphenDict *dict = nullptr;
700         // if this dictionary has not been loaded yet do that
701         if (!mvDicts[k].aPtr)
702         {
703             if (!LoadDictionary(mvDicts[k]))
704                 return nullptr;
705         }
706 
707         // otherwise hyphenate the word with that dictionary
708         dict = mvDicts[k].aPtr;
709         rtl_TextEncoding eEnc = mvDicts[k].eEnc;
710         CharClass* pCC = mvDicts[k].apCC.get();
711 
712         // we don't want to work with a default text encoding since following incorrect
713         // results may occur only for specific text and thus may be hard to notice.
714         // Thus better always make a clean exit here if the text encoding is in question.
715         // Hopefully something not working at all will raise proper attention quickly. ;-)
716         DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
717         if (eEnc == RTL_TEXTENCODING_DONTKNOW)
718             return nullptr;
719 
720         // first handle smart quotes both single and double
721         OUStringBuffer rBuf(aWord);
722         sal_Int32 nc = rBuf.getLength();
723         sal_Unicode ch;
724         for (sal_Int32 ix=0; ix < nc; ix++)
725         {
726             ch = rBuf[ix];
727             if ((ch == 0x201C) || (ch == 0x201D))
728                 rBuf[ix] = u'"';
729             if ((ch == 0x2018) || (ch == 0x2019))
730                 rBuf[ix] = u'\'';
731         }
732         OUString nWord(rBuf.makeStringAndClear());
733 
734         // now convert word to all lowercase for pattern recognition
735         OUString nTerm(makeLowerCase(nWord, pCC));
736 
737         // now convert word to needed encoding
738         OString encWord(OU2ENC(nTerm,eEnc));
739 
740         sal_Int32 wordlen = encWord.getLength();
741         std::unique_ptr<char[]> lcword(new char[wordlen+1]);
742         std::unique_ptr<char[]> hyphens(new char[wordlen+5]);
743         char ** rep = nullptr; // replacements of discretionary hyphenation
744         int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
745         int * cut = nullptr; // length of deletions in original word
746 
747         // copy converted word into simple char buffer
748         strcpy(lcword.get(),encWord.getStr());
749 
750         // first remove any trailing periods
751         sal_Int32 n = wordlen-1;
752         while((n >=0) && (lcword[n] == '.'))
753             n--;
754         n++;
755         if (n > 0)
756         {
757             const bool bFailed = 0 != hnj_hyphen_hyphenate3(dict, lcword.get(), n, hyphens.get(), nullptr,
758                     &rep, &pos, &cut, minLead, minTrail,
759                     std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead - std::max<sal_Int16>(dict->lhmin, 2))),
760                     std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
761             if (bFailed)
762             {
763                 if (rep)
764                 {
765                     for(int j = 0; j < n; j++)
766                     {
767                         if (rep[j]) free(rep[j]);
768                     }
769                     free(rep);
770                 }
771                 if (pos) free(pos);
772                 if (cut) free(cut);
773 
774                 return nullptr;
775             }
776         }
777         // now backfill hyphens[] for any removed periods
778         for (sal_Int32 c = n; c < wordlen; c++)
779             hyphens[c] = '0';
780         hyphens[wordlen] = '\0';
781 
782         sal_Int32 nHyphCount = 0;
783 
784         for ( sal_Int32 i = 0; i < encWord.getLength(); i++)
785         {
786             if (hyphens[i]&1)
787                 nHyphCount++;
788         }
789 
790         Sequence< sal_Int16 > aHyphPos(nHyphCount);
791         sal_Int16 *pPos = aHyphPos.getArray();
792         OUStringBuffer hyphenatedWordBuffer;
793         nHyphCount = 0;
794 
795         for (sal_Int32 i = 0; i < nWord.getLength(); i++)
796         {
797             hyphenatedWordBuffer.append(aWord[i]);
798             // hyphenation position
799             if (hyphens[i]&1)
800             {
801                 // linguistic::PossibleHyphens is stuck with
802                 // css::uno::Sequence<sal_Int16> because of
803                 // css.linguistic2.XPossibleHyphens.getHyphenationPositions, so
804                 // any further positions need to be ignored:
805                 assert(i >= SAL_MIN_INT16);
806                 if (i > SAL_MAX_INT16)
807                 {
808                     SAL_WARN(
809                         "lingucomponent",
810                         "hyphen pos " << i << " > SAL_MAX_INT16 in \"" << aWord
811                             << "\"");
812                     continue;
813                 }
814                 pPos[nHyphCount] = i;
815                 hyphenatedWordBuffer.append('=');
816                 nHyphCount++;
817             }
818         }
819 
820         OUString hyphenatedWord = hyphenatedWordBuffer.makeStringAndClear();
821 
822         Reference< XPossibleHyphens > xRes = PossibleHyphens::CreatePossibleHyphens(
823             aWord, LinguLocaleToLanguage( aLocale ), hyphenatedWord, aHyphPos);
824 
825         if (rep)
826         {
827             for(int j = 0; j < n; j++)
828             {
829                 if (rep[j]) free(rep[j]);
830             }
831             free(rep);
832         }
833         if (pos) free(pos);
834         if (cut) free(cut);
835 
836         return xRes;
837     }
838 
839     return nullptr;
840 }
841 
/**
 * Returns aTerm converted to lowercase by pCC, or aTerm unchanged when no
 * character classification object is supplied.
 */
OUString Hyphenator::makeLowerCase(const OUString& aTerm, CharClass const * pCC)
{
    if (!pCC)
        return aTerm;

    return pCC->lowercase(aTerm);
}
848 
/**
 * Returns aTerm converted to uppercase by pCC, or aTerm unchanged when no
 * character classification object is supplied.
 */
OUString Hyphenator::makeUpperCase(const OUString& aTerm, CharClass const * pCC)
{
    if (!pCC)
        return aTerm;

    return pCC->uppercase(aTerm);
}
855 
/**
 * Returns aTerm with its first character uppercased and the remainder
 * lowercased, using pCC for the case mapping.
 *
 * The split between "first character" and "remainder" is made at the end of
 * the first Unicode code point, so a leading surrogate pair is passed to the
 * case mapping as a whole instead of being cut in half.
 *
 * @param aTerm  the term to convert
 * @param pCC    character classification used for the conversion; may be null
 * @return the converted term, or aTerm unchanged if pCC is null or aTerm is
 *         empty
 */
OUString Hyphenator::makeInitCap(const OUString& aTerm, CharClass const * pCC)
{
    const sal_Int32 tlen = aTerm.getLength();
    if (!pCC || tlen == 0)
        return aTerm;

    // number of UTF-16 code units making up the first code point (1 or 2)
    sal_Int32 nInitialLen = 0;
    aTerm.iterateCodePoints(&nInitialLen);

    const OUString aInitial = pCC->uppercase(aTerm, 0, nInitialLen);
    if (nInitialLen < tlen)
        return aInitial + pCC->lowercase(aTerm, nInitialLen, tlen - nInitialLen);

    // the term consists of the initial only
    return aInitial;
}
869 
addLinguServiceEventListener(const Reference<XLinguServiceEventListener> & rxLstnr)870 sal_Bool SAL_CALL Hyphenator::addLinguServiceEventListener(
871         const Reference< XLinguServiceEventListener >& rxLstnr )
872 {
873     MutexGuard  aGuard( GetLinguMutex() );
874 
875     bool bRes = false;
876     if (!bDisposing && rxLstnr.is())
877     {
878         bRes = GetPropHelper().addLinguServiceEventListener( rxLstnr );
879     }
880     return bRes;
881 }
882 
removeLinguServiceEventListener(const Reference<XLinguServiceEventListener> & rxLstnr)883 sal_Bool SAL_CALL Hyphenator::removeLinguServiceEventListener(
884         const Reference< XLinguServiceEventListener >& rxLstnr )
885 {
886     MutexGuard  aGuard( GetLinguMutex() );
887 
888     bool bRes = false;
889     if (!bDisposing && rxLstnr.is())
890     {
891         bRes = GetPropHelper().removeLinguServiceEventListener( rxLstnr );
892     }
893     return bRes;
894 }
895 
getServiceDisplayName(const Locale & rLocale)896 OUString SAL_CALL Hyphenator::getServiceDisplayName(const Locale& rLocale)
897 {
898     std::locale loc(Translate::Create("svt", LanguageTag(rLocale)));
899     return Translate::get(STR_DESCRIPTION_LIBHYPHEN, loc);
900 }
901 
initialize(const Sequence<Any> & rArguments)902 void SAL_CALL Hyphenator::initialize( const Sequence< Any >& rArguments )
903 {
904     MutexGuard  aGuard( GetLinguMutex() );
905 
906     if (pPropHelper)
907         return;
908 
909     sal_Int32 nLen = rArguments.getLength();
910     if (2 == nLen)
911     {
912         Reference< XLinguProperties >   xPropSet;
913         rArguments.getConstArray()[0] >>= xPropSet;
914         // rArguments.getConstArray()[1] >>= xDicList;
915 
916         //! Pointer allows for access of the non-UNO functions.
917         //! And the reference to the UNO-functions while increasing
918         //! the ref-count and will implicitly free the memory
919         //! when the object is no longer used.
920         pPropHelper.reset( new PropertyHelper_Hyphenation( static_cast<XHyphenator *>(this), xPropSet ) );
921         pPropHelper->AddAsPropListener();   //! after a reference is established
922     }
923     else {
924         OSL_FAIL( "wrong number of arguments in sequence" );
925     }
926 }
927 
dispose()928 void SAL_CALL Hyphenator::dispose()
929 {
930     MutexGuard  aGuard( GetLinguMutex() );
931 
932     if (!bDisposing)
933     {
934         bDisposing = true;
935         EventObject aEvtObj( static_cast<XHyphenator *>(this) );
936         aEvtListeners.disposeAndClear( aEvtObj );
937         if (pPropHelper)
938         {
939             pPropHelper->RemoveAsPropListener();
940             pPropHelper.reset();
941         }
942     }
943 }
944 
addEventListener(const Reference<XEventListener> & rxListener)945 void SAL_CALL Hyphenator::addEventListener( const Reference< XEventListener >& rxListener )
946 {
947     MutexGuard  aGuard( GetLinguMutex() );
948 
949     if (!bDisposing && rxListener.is())
950         aEvtListeners.addInterface( rxListener );
951 }
952 
removeEventListener(const Reference<XEventListener> & rxListener)953 void SAL_CALL Hyphenator::removeEventListener( const Reference< XEventListener >& rxListener )
954 {
955     MutexGuard  aGuard( GetLinguMutex() );
956 
957     if (!bDisposing && rxListener.is())
958         aEvtListeners.removeInterface( rxListener );
959 }
960 
961 // Service specific part
getImplementationName()962 OUString SAL_CALL Hyphenator::getImplementationName()
963 {
964     return u"org.openoffice.lingu.LibHnjHyphenator"_ustr;
965 }
966 
supportsService(const OUString & ServiceName)967 sal_Bool SAL_CALL Hyphenator::supportsService( const OUString& ServiceName )
968 {
969     return cppu::supportsService(this, ServiceName);
970 }
971 
getSupportedServiceNames()972 Sequence< OUString > SAL_CALL Hyphenator::getSupportedServiceNames()
973 {
974     return { SN_HYPHENATOR };
975 }
976 
977 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
lingucomponent_Hyphenator_get_implementation(css::uno::XComponentContext *,css::uno::Sequence<css::uno::Any> const &)978 lingucomponent_Hyphenator_get_implementation(
979     css::uno::XComponentContext* , css::uno::Sequence<css::uno::Any> const&)
980 {
981     return cppu::acquire(new Hyphenator());
982 }
983 
984 
985 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
986