/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include <com/sun/star/uno/Reference.h>

#include <comphelper/sequence.hxx>
#include <comphelper/processfactory.hxx>
#include <cppuhelper/factory.hxx>
#include <cppuhelper/supportsservice.hxx>
#include <cppuhelper/weak.hxx>
#include <com/sun/star/linguistic2/XLinguProperties.hpp>
#include <com/sun/star/linguistic2/LinguServiceManager.hpp>
#include <com/sun/star/linguistic2/XSpellChecker1.hpp>
#include <i18nlangtag/languagetag.hxx>
#include <tools/debug.hxx>
#include <osl/mutex.hxx>
#include <osl/thread.h>

#include <hyphen.h>
#include "hyphenimp.hxx"

#include <linguistic/hyphdta.hxx>
#include <rtl/ustring.hxx>
#include <rtl/ustrbuf.hxx>
#include <rtl/textenc.h>
#include <sal/log.hxx>

#include <linguistic/misc.hxx>
#include <svtools/strings.hrc>
#include <unotools/charclass.hxx>
#include <unotools/lingucfg.hxx>
#include <unotools/resmgr.hxx>
#include <osl/file.hxx>

#include <stdio.h>
#include <string.h>

#include <cassert>
#include <numeric>
#include <vector>
#include <set>
#include <memory>
#include <o3tl/string_view.hxx>

// XML-header to query SPELLML support
constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";
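// Passing SPELLML_SUPPORT to XSpellChecker1::isValid() is how hyphenate()
// below probes whether the active spell checker understands SpellML queries;
// only then is a "<?xml?><query type='analyze'>..." request sent to obtain
// the morphological analysis used for compound-word hyphenation.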

using namespace osl;
using namespace com::sun::star;
using namespace com::sun::star::beans;
using namespace com::sun::star::lang;
using namespace com::sun::star::uno;
using namespace com::sun::star::linguistic2;
using namespace linguistic;

static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
{
    uno::Reference< XComponentContext > xContext( comphelper::getProcessComponentContext() );
    uno::Reference< XLinguServiceManager2 > xRes = LinguServiceManager::create( xContext );
    return xRes;
}

Hyphenator::Hyphenator() :
    aEvtListeners( GetLinguMutex() )
{
    bDisposing = false;
}

Hyphenator::~Hyphenator()
{
    for (auto & rInfo : mvDicts)
    {
        if (rInfo.aPtr)
            hnj_hyphen_free(rInfo.aPtr);
    }

    if (pPropHelper)
    {
        pPropHelper->RemoveAsPropListener();
    }
}

PropertyHelper_Hyphenation& Hyphenator::GetPropHelper_Impl()
{
    if (!pPropHelper)
    {
        Reference< XLinguProperties > xPropSet = GetLinguProperties();

        pPropHelper.reset( new PropertyHelper_Hyphenation( static_cast<XHyphenator *>(this), xPropSet ) );
        pPropHelper->AddAsPropListener();   //! after a reference is established
    }
    return *pPropHelper;
}

Sequence< Locale > SAL_CALL Hyphenator::getLocales()
{
    MutexGuard aGuard( GetLinguMutex() );

    // this routine should return the locales supported by the installed
    // dictionaries.
    if (mvDicts.empty())
    {
        SvtLinguConfig aLinguCfg;

        // get list of dictionaries-to-use
        // (i.e. the list of dictionaries using the
        // new configuration entries).
        std::vector< SvtLinguConfigDictionaryEntry > aDics;
        uno::Sequence< OUString > aFormatList;
        aLinguCfg.GetSupportedDictionaryFormatsFor( u"Hyphenators"_ustr,
                u"org.openoffice.lingu.LibHnjHyphenator"_ustr, aFormatList );
        for (const auto& rFormat : aFormatList)
        {
            std::vector< SvtLinguConfigDictionaryEntry > aTmpDic(
                aLinguCfg.GetActiveDictionariesByFormat( rFormat ) );
            aDics.insert( aDics.end(), aTmpDic.begin(), aTmpDic.end() );
        }

        //!! for compatibility with old dictionaries (the ones not using extensions
        //!! or new configuration entries, but still using the dictionary.lst file)
        //!! Get the list of old style spell checking dictionaries to use...
        std::vector< SvtLinguConfigDictionaryEntry > aOldStyleDics(
            GetOldStyleDics( "HYPH" ) );

        // to prefer dictionaries with configuration entries we will only
        // use those old style dictionaries that add a language that
        // is not yet supported by the list of new style dictionaries
        MergeNewStyleDicsAndOldStyleDics( aDics, aOldStyleDics );

        if (!aDics.empty())
        {
            // get supported locales from the dictionaries-to-use...
            std::set<OUString> aLocaleNamesSet;
            for (auto const& dict : aDics)
            {
                for (const auto& rLocaleName : dict.aLocaleNames)
                {
                    aLocaleNamesSet.insert( rLocaleName );
                }
            }
            // ... and add them to the resulting sequence
            std::vector<Locale> aLocalesVec;
            aLocalesVec.reserve(aLocaleNamesSet.size());

            std::transform(aLocaleNamesSet.begin(), aLocaleNamesSet.end(), std::back_inserter(aLocalesVec),
                [](const OUString& localeName) { return LanguageTag::convertToLocale(localeName); });

            aSuppLocales = comphelper::containerToSequence(aLocalesVec);

            //! For each dictionary and each locale we need a separate entry.
            //! If this results in more than one dictionary per locale then (for now)
            //! it is undefined which dictionary gets used.
            //! In the future the implementation should support using several dictionaries
            //! for one locale.
            sal_Int32 numdict = std::accumulate(aDics.begin(), aDics.end(), 0,
                [](const sal_Int32 nSum, const SvtLinguConfigDictionaryEntry& dict) {
                    return nSum + dict.aLocaleNames.getLength(); });

            // add dictionary information
            mvDicts.resize(numdict);

            sal_Int32 k = 0;
            for (auto const& dict : aDics)
            {
                if (dict.aLocaleNames.hasElements() &&
                    dict.aLocations.hasElements())
                {
                    // currently only one language per dictionary is supported in the actual implementation...
                    // Thus here we work-around this by adding the same dictionary several times.
                    // Once for each of its supported locales.
                    for (const auto& rLocaleName : dict.aLocaleNames)
                    {
                        LanguageTag aLanguageTag(rLocaleName);
                        mvDicts[k].aPtr = nullptr;
                        mvDicts[k].eEnc = RTL_TEXTENCODING_DONTKNOW;
                        mvDicts[k].aLoc = aLanguageTag.getLocale();
                        mvDicts[k].apCC.reset( new CharClass( std::move(aLanguageTag) ) );
                        // also both files have to be in the same directory and the
                        // file names must only differ in the extension (.aff/.dic).
                        // Thus we use the first location only and strip the extension part.
                        OUString aLocation = dict.aLocations[0];
                        sal_Int32 nPos = aLocation.lastIndexOf( '.' );
                        aLocation = aLocation.copy( 0, nPos );
                        mvDicts[k].aName = aLocation;

                        ++k;
                    }
                }
            }
            DBG_ASSERT( k == numdict, "index mismatch?" );
        }
        else
        {
            // no dictionary found, so register no dictionaries
            mvDicts.clear();
            aSuppLocales.realloc(0);
        }
    }

    return aSuppLocales;
}

sal_Bool SAL_CALL Hyphenator::hasLocale(const Locale& rLocale)
{
    MutexGuard aGuard( GetLinguMutex() );

    if (!aSuppLocales.hasElements())
        getLocales();

    return comphelper::findValue(aSuppLocales, rLocale) != -1;
}

namespace {
bool LoadDictionary(HDInfo& rDict)
{
    OUString DictFN = rDict.aName + ".dic";
    OUString dictpath;

    osl::FileBase::getSystemPathFromFileURL(DictFN, dictpath);

#if defined(_WIN32)
    // hnj_hyphen_load expects a UTF-8 encoded path with the \\?\ long path prefix.
    OString sTmp = Win_AddLongPathPrefix(OUStringToOString(dictpath, RTL_TEXTENCODING_UTF8));
#else
    OString sTmp(OU2ENC(dictpath, osl_getThreadTextEncoding()));
#endif
    HyphenDict *dict = nullptr;
    if ((dict = hnj_hyphen_load(sTmp.getStr())) == nullptr)
    {
        SAL_WARN(
            "lingucomponent",
            "Couldn't find file " << dictpath);
        return false;
    }
    rDict.aPtr = dict;
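    // dict->cset holds the character set declared on the first line of the
    // .dic file (e.g. "UTF-8" or "ISO8859-1"); the encoding derived from it
    // below is later used to convert words before they are handed to libhyphen.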
    rDict.eEnc = getTextEncodingFromCharset(dict->cset);
    return true;
}
}

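// Overview of hyphenate(): find the dictionary matching the locale (loading
// it on first use), normalize smart quotes, lowercase the word, convert it to
// the dictionary's text encoding and strip trailing periods, then call
// hnj_hyphen_hyphenate3(). libhyphen fills hyphens[] with one digit per
// character; an odd value means a break is allowed after that character. For
// non-standard ("discretionary") hyphenations the rep/pos/cut arrays describe
// the replacement string, its offset and the number of replaced characters at
// each break point.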
Reference< XHyphenatedWord > SAL_CALL Hyphenator::hyphenate( const OUString& aWord,
        const css::lang::Locale& aLocale,
        sal_Int16 nMaxLeading,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
    PropertyHelper_Hyphenation& rHelper = GetPropHelper();
    rHelper.SetTmpPropVals(aProperties);
    sal_Int16 minTrail = rHelper.GetMinTrailing();
    sal_Int16 minLead = rHelper.GetMinLeading();
    sal_Int16 minCompoundLead = rHelper.GetCompoundMinLeading();
    sal_Int16 minLen = rHelper.GetMinWordLength();
    bool bNoHyphenateCaps = rHelper.IsNoHyphenateCaps();

    rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;

    Reference< XHyphenatedWord > xRes;

    int k = -1;
    for (size_t j = 0; j < mvDicts.size(); ++j)
    {
        if (aLocale == mvDicts[j].aLoc)
            k = j;
    }

    // if we have a hyphenation dictionary matching this locale
    if (k != -1)
    {
        int nHyphenationPos = -1;
        int nHyphenationPosAlt = -1;
        int nHyphenationPosAltHyph = -1;

        // if this dictionary has not been loaded yet do that
        if (!mvDicts[k].aPtr)
        {
            if (!LoadDictionary(mvDicts[k]))
                return nullptr;
        }

        // otherwise hyphenate the word with that dictionary
        HyphenDict *dict = mvDicts[k].aPtr;
        eEnc = mvDicts[k].eEnc;
        CharClass * pCC = mvDicts[k].apCC.get();

        // Don't hyphenate uppercase words if requested
        if (bNoHyphenateCaps && aWord == makeUpperCase(aWord, pCC))
        {
            return nullptr;
        }

        // we don't want to work with a default text encoding, since the resulting
        // errors may occur only for specific text and thus may be hard to notice.
        // Thus better always make a clean exit here if the text encoding is in question.
        // Hopefully something not working at all will raise proper attention quickly. ;-)
        DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
        if (eEnc == RTL_TEXTENCODING_DONTKNOW)
            return nullptr;

        CapType ct = capitalType(aWord, pCC);

        // first convert any smart quotes or apostrophes to normal ones
        OUStringBuffer rBuf(aWord);
        sal_Int32 nc = rBuf.getLength();
        sal_Unicode ch;
        for (sal_Int32 ix=0; ix < nc; ix++)
        {
            ch = rBuf[ix];
            if ((ch == 0x201C) || (ch == 0x201D))
                rBuf[ix] = u'"';
            if ((ch == 0x2018) || (ch == 0x2019))
                rBuf[ix] = u'\'';
        }
        OUString nWord(rBuf.makeStringAndClear());

        // now convert word to all lowercase for pattern recognition
        OUString nTerm(makeLowerCase(nWord, pCC));

        // now convert word to needed encoding
        OString encWord(OU2ENC(nTerm,eEnc));

        int wordlen = encWord.getLength();
        std::unique_ptr<char[]> lcword(new char[wordlen + 1]);
        std::unique_ptr<char[]> hyphens(new char[wordlen + 5]);

        char ** rep = nullptr; // replacements of discretionary hyphenation
        int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
        int * cut = nullptr; // length of deletions in original word

        // copy converted word into simple char buffer
        strcpy(lcword.get(),encWord.getStr());

        // now strip off any ending periods
        int n = wordlen-1;
        while((n >=0) && (lcword[n] == '.'))
            n--;
        n++;
        if (n > 0)
        {
            const bool bFailed = 0 != hnj_hyphen_hyphenate3( dict, lcword.get(), n, hyphens.get(), nullptr,
                    &rep, &pos, &cut, minLead, minTrail,
                    std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead - std::max<sal_Int16>(dict->lhmin, 2))),
                    std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
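            // The last two arguments are the compound-word left/right hyphen
            // minima: the dictionary defaults dict->clhmin / dict->crhmin
            // (clamped to at least 2), raised by the amount by which the
            // user-configured minLead / minTrail exceed the dictionary's
            // plain lhmin / rhmin values.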
            if (bFailed)
            {
                // whoops something did not work
                if (rep)
                {
                    for(int j = 0; j < n; j++)
                    {
                        if (rep[j]) free(rep[j]);
                    }
                    free(rep);
                }
                if (pos) free(pos);
                if (cut) free(cut);
                return nullptr;
            }
        }

        // now backfill hyphens[] for any removed trailing periods
        for (int c = n; c < wordlen; c++) hyphens[c] = '0';
        hyphens[wordlen] = '\0';

        sal_Int32 Leading = GetPosInWordToCheck( aWord, nMaxLeading );

        // use morphological analysis of Hunspell to get better hyphenation of compound
        // words, optionally when the hyphenation zone is enabled
        // pa: fields contain stems resulting from compound word analysis of non-dictionary words
        // hy: fields contain hyphenation data of dictionary (compound) words
        Reference< XSpellAlternatives > xTmpRes;
        bool bAnalyzed = false;             // it is enough to analyse the word once
        bool bCompoundHyphenation = true;   // try to hyphenate compound words better
        OUString sStems;          // processed result of the compound word analysis, e.g. com|pound|word
        sal_Int32 nSuffixLen = 0; // do not remove break points in suffixes

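        // A sketch of the SpellML exchange used below (the exact reply format
        // depends on the Hunspell dictionary, so this is illustrative only):
        // sending
        //   <?xml?><query type='analyze'><word>compoundword</word></query>
        // may return an analysis such as
        //   <a>... pa:compound pa:word st:word ...</a>
        // whose pa:/hy: fields are concatenated into sStems (e.g.
        // "compound||word"); the '|' separators mark the stem boundaries
        // consulted when filtering break points.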
        for (sal_Int32 i = 0; i < n; i++)
        {
            int leftrep = 0;
            bool hit = (n >= minLen);
            if (!rep || !rep[i])
            {
                hit = hit && (hyphens[i]&1) && (i < Leading);
                hit = hit && (i >= (minLead-1) );
                hit = hit && ((n - i - 1) >= minTrail);
            }
            else
            {
                // calculate the length of the changed characters before the hyphenation point (marked with '=')
                for (char * c = rep[i]; *c && (*c != '='); c++)
                {
                    if (eEnc == RTL_TEXTENCODING_UTF8)
                    {
                        // count only UTF-8 lead bytes; continuation bytes have the bit pattern 10xxxxxx
                        if (static_cast<unsigned char>(*c) >> 6 != 2)
                            leftrep++;
                    }
                    else
                        leftrep++;
                }
                hit = hit && (hyphens[i]&1) && ((i + leftrep - pos[i]) < Leading);
                hit = hit && ((i + leftrep - pos[i]) >= (minLead-1) );
                hit = hit && ((n - i - 1 + sal::static_int_cast< sal_sSize >(strlen(rep[i])) - leftrep - 1) >= minTrail);
            }
            if (hit)
            {
                // skip hyphenation right after stem boundaries in compound words
                // if minCompoundLead > 2 (i.e. skip break points less than minCompoundLead characters away)
                if ( bCompoundHyphenation && minCompoundLead > 2 && nHyphenationPos > -1 && i - nHyphenationPos < minCompoundLead )
                {
                    uno::Reference< XLinguServiceManager2 > xLngSvcMgr( GetLngSvcMgr_Impl() );
                    uno::Reference< XSpellChecker1 > xSpell;

                    LanguageType nLanguage = LinguLocaleToLanguage( aLocale );

                    xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );

                    // get morphological analysis of the word
                    if ( ( bAnalyzed && xTmpRes.is() ) || ( xSpell.is() && xSpell->isValid(
                            SPELLML_SUPPORT, static_cast<sal_uInt16>(nLanguage),
                            uno::Sequence< beans::PropertyValue >() ) ) )
                    {
                        if ( !bAnalyzed )
                        {
                            xTmpRes = xSpell->spell( "<?xml?><query type='analyze'><word>" +
                                    aWord + "</word></query>",
                                    static_cast<sal_uInt16>(nLanguage),
                                    uno::Sequence< beans::PropertyValue >() );
                            bAnalyzed = true;

                            if (xTmpRes.is())
                            {
                                Sequence<OUString>seq = xTmpRes->getAlternatives();
                                if (seq.hasElements())
                                {
                                    sal_Int32 nEndOfFirstAnalysis = seq[0].indexOf("</a>");
                                    // FIXME use only the first analysis
                                    OUString morph(
                                        seq[0].copy(0, nEndOfFirstAnalysis));

                                    // concatenate pa: fields, i.e. stems in the analysis:
                                    // pa:stem1 pa:stem2 pa:stem3 -> stem1||stem2||stem3
                                    sal_Int32 nPa = -1;
                                    while ( (nPa = morph.indexOf(u" pa:", nPa + 1)) > -1 )
                                    {
                                        // use the hy: field of the current stem, if it exists
                                        // pa:stem1 hy:st|em1 pa:stem2 -> st|em1||stem2
                                        sal_Int32 nHy = morph.indexOf(u" hy:", nPa + 3);
                                        sal_Int32 nPa2 = morph.indexOf(u" pa:", nPa + 3);

                                        if ( nHy > -1 && ( nPa2 == -1 || nHy < nPa2 ) )
                                        {
                                            OUString sStems2(morph.getToken(1, ' ', nHy).copy(3));
                                            if ( sStems2.indexOf('|') > -1 )
                                                sStems += sStems2 + u"||";
                                            else if ( sal_Int32 nBreak = o3tl::toInt32(sStems2) )
                                            {
                                                OUString sPa(morph.getToken(1, ' ', nPa).copy(3));
                                                if ( nBreak < sPa.getLength() )
                                                    sStems += OUString::Concat(sPa.subView(0, nBreak)) + u"|" +
                                                            sPa.subView(nBreak);
                                            }
                                        }
                                        else
                                        {
                                            OUString sPa(morph.getToken(1, ' ', nPa).copy(3));

                                            // handle special case: missing pa: in morphological analysis
                                            // before in-word suffixes (German, Swedish etc. dictionaries)
                                            // (recognized by the single last pa:)
                                            if (sStems.isEmpty() && nPa2 == -1 && aWord.endsWith(sPa))
                                            {
                                                sStems = OUString::Concat(aWord.subView(0, aWord.getLength() -
                                                            sPa.getLength())) + u"||" +
                                                            aWord.subView(aWord.getLength() -
                                                                sPa.getLength());
                                                break;
                                            }

                                            sStems += sPa + "||";

                                            // count suffix length
                                            sal_Int32 nSt = morph.lastIndexOf(" st:");
                                            if ( nSt > -1 )
                                            {
                                                sal_Int32 nStemLen =
                                                    o3tl::getToken(morph, 1, ' ', nSt).length() - 3;
                                                if ( nStemLen < sPa.getLength() )
                                                    nSuffixLen = sPa.getLength() - nStemLen;
                                            }
                                        }

                                        if ( nPa == -1 ) // getToken() can modify nPa
                                            break;
                                    }

                                    // only hy:, but not pa:
                                    if ( sStems.isEmpty() )
                                    {
                                        // check hy: (pre-defined hyphenation)
                                        sal_Int32 nHy = morph.indexOf(" hy:");
                                        if (nHy > -1)
                                        {
                                            sStems = morph.getToken(1, ' ', nHy).copy(3);
                                            if ( sStems.indexOf('|') == -1 && sStems.indexOf('-') == -1 )
                                            {
                                                if ( sal_Int32 nBreak = o3tl::toInt32(sStems) )
                                                {
                                                    if ( nBreak < aWord.getLength() )
                                                        sStems += OUString::Concat(aWord.subView(0, nBreak)) + u"|" +
                                                                aWord.subView(nBreak);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }

                        // handle string separated by |, e.g. "program hy:pro|gram"
                        if ( sStems.indexOf('|') > -1 )
                        {
                            sal_Int32 nLetters = 0;    // count of non-separator characters
                            sal_Int32 nSepPos = -1;    // position of the last '|' used for stem boundaries
                            bool bWeightedSep = false; // double separator || = weighted stem boundary
                            sal_Int32 j = 0;
                            for (; j < sStems.getLength() && nLetters <= i; j++)
                            {
                                if ( sStems[j] == '|' )
                                {
                                    bWeightedSep = nSepPos > -1 && (j - 1 == nSepPos);
                                    nSepPos = j;
                                }
                                else if ( sStems[j] != '-' && sStems[j] != '=' && sStems[j] != '*' )
                                    ++nLetters;
                            }
                            // skip break points near stem boundaries
                            if (
                                // there is a stem boundary before the actual break point
                                nSepPos > -1 &&
                                // and the break point is within a stem, i.e. not in the
                                // suffix of the last stem
                                i < aWord.getLength() - nSuffixLen - 1 &&
                                // and it is not another stem boundary
                                j + 1 < sStems.getLength() &&
                                ( sStems[j + 1] != u'|' ||
                                  // except when only the previous one was a weighted boundary
                                  ( bWeightedSep && ( j + 2 == sStems.getLength() ||
                                        sStems[j + 2] != u'|' ) ) ) )
                            {
                                continue;
                            }
                        }
                        else
                            // not a compound word
                            bCompoundHyphenation = false;
                    }
                    else
                        // no SPELLML support, no morphological analysis
                        bCompoundHyphenation = false;
                }

                nHyphenationPos = i;
                if (rep && rep[i])
                {
                    nHyphenationPosAlt = i - pos[i];
                    nHyphenationPosAltHyph = i + leftrep - pos[i];
                }
            }
        }

        if (nHyphenationPos == -1)
        {
            xRes = nullptr;
        }
        else
        {
            if (rep && rep[nHyphenationPos])
            {
                // remove equal sign
                char * s = rep[nHyphenationPos];
                int eq = 0;
                for (; *s; s++)
                {
                    if (*s == '=') eq = 1;
                    if (eq) *s = *(s + 1);
                }
                OUString repHyphlow(rep[nHyphenationPos], strlen(rep[nHyphenationPos]), eEnc);
                OUString repHyph;
                switch (ct)
                {
                    case CapType::ALLCAP:
                    {
                        repHyph = makeUpperCase(repHyphlow, pCC);
                        break;
                    }
                    case CapType::INITCAP:
                    {
                        if (nHyphenationPosAlt == -1)
                            repHyph = makeInitCap(repHyphlow, pCC);
                        else
                            repHyph = repHyphlow;
                        break;
                    }
                    default:
                    {
                        repHyph = repHyphlow;
                        break;
                    }
                }

                // handle shortening
                sal_Int16 nPos = static_cast<sal_Int16>((nHyphenationPosAltHyph < nHyphenationPos) ?
                        nHyphenationPosAltHyph : nHyphenationPos);
                // discretionary hyphenation
                xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ), nPos,
                        aWord.replaceAt(nHyphenationPosAlt + 1, cut[nHyphenationPos], repHyph),
                        static_cast<sal_Int16>(nHyphenationPosAltHyph));
            }
            else
            {
                xRes = HyphenatedWord::CreateHyphenatedWord( aWord, LinguLocaleToLanguage( aLocale ),
                        static_cast<sal_Int16>(nHyphenationPos), aWord, static_cast<sal_Int16>(nHyphenationPos));
            }
        }

        if (rep)
        {
            for(int j = 0; j < n; j++)
            {
                if (rep[j]) free(rep[j]);
            }
            free(rep);
        }
        if (pos) free(pos);
        if (cut) free(cut);
        return xRes;
    }
    return nullptr;
}

Reference < XHyphenatedWord > SAL_CALL Hyphenator::queryAlternativeSpelling(
        const OUString& aWord,
        const css::lang::Locale& aLocale,
        sal_Int16 nIndex,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
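    // An "alternative spelling" is a hyphenation that changes the spelling of
    // the word at the break point (a classic example is pre-reform German
    // orthography, where "Schiffahrt" breaks as "Schiff-fahrt"; the example is
    // illustrative, the actual rules come from the hyphenation dictionary).
    // We look for a hyphenation whose reported break position maps back to
    // nIndex in the original word.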
    // First try to allow only one extra character before the hyphen, so that we
    // do not miss the right break point:
    for (int extrachar = 1; extrachar <= 2; extrachar++)
    {
        Reference< XHyphenatedWord > xRes = hyphenate(aWord, aLocale, nIndex + 1 + extrachar, aProperties);
        if (xRes.is() && xRes->isAlternativeSpelling() && xRes->getHyphenationPos() == nIndex)
            return xRes;
    }
    return nullptr;
}

Reference< XPossibleHyphens > SAL_CALL Hyphenator::createPossibleHyphens( const OUString& aWord,
        const css::lang::Locale& aLocale,
        const css::uno::Sequence< css::beans::PropertyValue >& aProperties )
{
    PropertyHelper_Hyphenation& rHelper = GetPropHelper();
    rHelper.SetTmpPropVals(aProperties);
    sal_Int16 minTrail = rHelper.GetMinTrailing();
    sal_Int16 minLead = rHelper.GetMinLeading();
    sal_Int16 minLen = rHelper.GetMinWordLength();

    // Resolves: fdo#41083 honour MinWordLength in "createPossibleHyphens" as
    // well as "hyphenate"
    if (aWord.getLength() < minLen)
    {
        return PossibleHyphens::CreatePossibleHyphens( aWord, LinguLocaleToLanguage( aLocale ),
                aWord, Sequence< sal_Int16 >() );
    }

    int k = -1;
    for (size_t j = 0; j < mvDicts.size(); ++j)
    {
        if (aLocale == mvDicts[j].aLoc)
            k = j;
    }

    // if we have a hyphenation dictionary matching this locale
    if (k != -1)
    {
        HyphenDict *dict = nullptr;
        // if this dictionary has not been loaded yet do that
        if (!mvDicts[k].aPtr)
        {
            if (!LoadDictionary(mvDicts[k]))
                return nullptr;
        }

        // otherwise hyphenate the word with that dictionary
        dict = mvDicts[k].aPtr;
        rtl_TextEncoding eEnc = mvDicts[k].eEnc;
        CharClass* pCC = mvDicts[k].apCC.get();

        // we don't want to work with a default text encoding, since the resulting
        // errors may occur only for specific text and thus may be hard to notice.
        // Thus better always make a clean exit here if the text encoding is in question.
        // Hopefully something not working at all will raise proper attention quickly. ;-)
        DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
        if (eEnc == RTL_TEXTENCODING_DONTKNOW)
            return nullptr;

        // first handle smart quotes both single and double
        OUStringBuffer rBuf(aWord);
        sal_Int32 nc = rBuf.getLength();
        sal_Unicode ch;
        for (sal_Int32 ix=0; ix < nc; ix++)
        {
            ch = rBuf[ix];
            if ((ch == 0x201C) || (ch == 0x201D))
                rBuf[ix] = u'"';
            if ((ch == 0x2018) || (ch == 0x2019))
                rBuf[ix] = u'\'';
        }
        OUString nWord(rBuf.makeStringAndClear());

        // now convert word to all lowercase for pattern recognition
        OUString nTerm(makeLowerCase(nWord, pCC));

        // now convert word to needed encoding
        OString encWord(OU2ENC(nTerm,eEnc));

        sal_Int32 wordlen = encWord.getLength();
        std::unique_ptr<char[]> lcword(new char[wordlen+1]);
        std::unique_ptr<char[]> hyphens(new char[wordlen+5]);
        char ** rep = nullptr; // replacements of discretionary hyphenation
        int * pos = nullptr; // array of [hyphenation point] minus [deletion position]
        int * cut = nullptr; // length of deletions in original word

        // copy converted word into simple char buffer
        strcpy(lcword.get(),encWord.getStr());

        // first remove any trailing periods
        sal_Int32 n = wordlen-1;
        while((n >=0) && (lcword[n] == '.'))
            n--;
        n++;
        if (n > 0)
        {
            const bool bFailed = 0 != hnj_hyphen_hyphenate3(dict, lcword.get(), n, hyphens.get(), nullptr,
                    &rep, &pos, &cut, minLead, minTrail,
                    std::max<sal_Int16>(dict->clhmin, std::max<sal_Int16>(dict->clhmin, 2) + std::max(0, minLead - std::max<sal_Int16>(dict->lhmin, 2))),
                    std::max<sal_Int16>(dict->crhmin, std::max<sal_Int16>(dict->crhmin, 2) + std::max(0, minTrail - std::max<sal_Int16>(dict->rhmin, 2))) );
            if (bFailed)
            {
                if (rep)
                {
                    for(int j = 0; j < n; j++)
                    {
                        if (rep[j]) free(rep[j]);
                    }
                    free(rep);
                }
                if (pos) free(pos);
                if (cut) free(cut);

                return nullptr;
            }
        }
        // now backfill hyphens[] for any removed periods
        for (sal_Int32 c = n; c < wordlen; c++)
            hyphens[c] = '0';
        hyphens[wordlen] = '\0';
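        // Build the result: for every character of the word that allows a
        // break (odd value in hyphens[]), record its position and insert an
        // '=' marker into the display string; e.g. "hyphenation" might become
        // "hy=phen=a=tion" with positions { 1, 5, 6 }.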

        sal_Int32 nHyphCount = 0;

        for ( sal_Int32 i = 0; i < encWord.getLength(); i++)
        {
            if (hyphens[i]&1)
                nHyphCount++;
        }

        Sequence< sal_Int16 > aHyphPos(nHyphCount);
        sal_Int16 *pPos = aHyphPos.getArray();
        OUStringBuffer hyphenatedWordBuffer;
        nHyphCount = 0;

        for (sal_Int32 i = 0; i < nWord.getLength(); i++)
        {
            hyphenatedWordBuffer.append(aWord[i]);
            // hyphenation position
            if (hyphens[i]&1)
            {
                // linguistic::PossibleHyphens is stuck with
                // css::uno::Sequence<sal_Int16> because of
                // css.linguistic2.XPossibleHyphens.getHyphenationPositions, so
                // any further positions need to be ignored:
                assert(i >= SAL_MIN_INT16);
                if (i > SAL_MAX_INT16)
                {
                    SAL_WARN(
                        "lingucomponent",
                        "hyphen pos " << i << " > SAL_MAX_INT16 in \"" << aWord
                            << "\"");
                    continue;
                }
                pPos[nHyphCount] = i;
                hyphenatedWordBuffer.append('=');
                nHyphCount++;
            }
        }

        OUString hyphenatedWord = hyphenatedWordBuffer.makeStringAndClear();

        Reference< XPossibleHyphens > xRes = PossibleHyphens::CreatePossibleHyphens(
            aWord, LinguLocaleToLanguage( aLocale ), hyphenatedWord, aHyphPos);

        if (rep)
        {
            for(int j = 0; j < n; j++)
            {
                if (rep[j]) free(rep[j]);
            }
            free(rep);
        }
        if (pos) free(pos);
        if (cut) free(cut);

        return xRes;
    }

    return nullptr;
}

OUString Hyphenator::makeLowerCase(const OUString& aTerm, CharClass const * pCC)
{
    if (pCC)
        return pCC->lowercase(aTerm);
    return aTerm;
}

OUString Hyphenator::makeUpperCase(const OUString& aTerm, CharClass const * pCC)
{
    if (pCC)
        return pCC->uppercase(aTerm);
    return aTerm;
}

OUString Hyphenator::makeInitCap(const OUString& aTerm, CharClass const * pCC)
{
    sal_Int32 tlen = aTerm.getLength();
    if (pCC && tlen)
    {
        OUString bTemp = aTerm.copy(0,1);
        if (tlen > 1)
            return ( pCC->uppercase(bTemp, 0, 1) + pCC->lowercase(aTerm,1,(tlen-1)) );

        return pCC->uppercase(bTemp, 0, 1);
    }
    return aTerm;
}

sal_Bool SAL_CALL Hyphenator::addLinguServiceEventListener(
        const Reference< XLinguServiceEventListener >& rxLstnr )
{
    MutexGuard aGuard( GetLinguMutex() );

    bool bRes = false;
    if (!bDisposing && rxLstnr.is())
    {
        bRes = GetPropHelper().addLinguServiceEventListener( rxLstnr );
    }
    return bRes;
}

sal_Bool SAL_CALL Hyphenator::removeLinguServiceEventListener(
        const Reference< XLinguServiceEventListener >& rxLstnr )
{
    MutexGuard aGuard( GetLinguMutex() );

    bool bRes = false;
    if (!bDisposing && rxLstnr.is())
    {
        bRes = GetPropHelper().removeLinguServiceEventListener( rxLstnr );
    }
    return bRes;
}

OUString SAL_CALL Hyphenator::getServiceDisplayName(const Locale& rLocale)
{
    std::locale loc(Translate::Create("svt", LanguageTag(rLocale)));
    return Translate::get(STR_DESCRIPTION_LIBHYPHEN, loc);
}

void SAL_CALL Hyphenator::initialize( const Sequence< Any >& rArguments )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (pPropHelper)
        return;

    sal_Int32 nLen = rArguments.getLength();
    if (2 == nLen)
    {
        Reference< XLinguProperties > xPropSet;
        rArguments.getConstArray()[0] >>= xPropSet;
        // rArguments.getConstArray()[1] >>= xDicList;

        //! The raw pointer is used to access the non-UNO functions, while the
        //! reference to the UNO interface increases the ref-count and will
        //! implicitly free the memory when the object is no longer used.
        pPropHelper.reset( new PropertyHelper_Hyphenation( static_cast<XHyphenator *>(this), xPropSet ) );
        pPropHelper->AddAsPropListener();   //! after a reference is established
    }
    else {
        OSL_FAIL( "wrong number of arguments in sequence" );
    }
}

void SAL_CALL Hyphenator::dispose()
{
    MutexGuard aGuard( GetLinguMutex() );

    if (!bDisposing)
    {
        bDisposing = true;
        EventObject aEvtObj( static_cast<XHyphenator *>(this) );
        aEvtListeners.disposeAndClear( aEvtObj );
        if (pPropHelper)
        {
            pPropHelper->RemoveAsPropListener();
            pPropHelper.reset();
        }
    }
}

void SAL_CALL Hyphenator::addEventListener( const Reference< XEventListener >& rxListener )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (!bDisposing && rxListener.is())
        aEvtListeners.addInterface( rxListener );
}

void SAL_CALL Hyphenator::removeEventListener( const Reference< XEventListener >& rxListener )
{
    MutexGuard aGuard( GetLinguMutex() );

    if (!bDisposing && rxListener.is())
        aEvtListeners.removeInterface( rxListener );
}

// Service specific part
OUString SAL_CALL Hyphenator::getImplementationName()
{
    return u"org.openoffice.lingu.LibHnjHyphenator"_ustr;
}

sal_Bool SAL_CALL Hyphenator::supportsService( const OUString& ServiceName )
{
    return cppu::supportsService(this, ServiceName);
}

Sequence< OUString > SAL_CALL Hyphenator::getSupportedServiceNames()
{
    return { SN_HYPHENATOR };
}

extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
lingucomponent_Hyphenator_get_implementation(
    css::uno::XComponentContext*, css::uno::Sequence<css::uno::Any> const&)
{
    return cppu::acquire(new Hyphenator());
}


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */