xref: /core/svtools/source/svhtml/parhtml.cxx (revision b8237d35)
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <comphelper/string.hxx>
21 #include <o3tl/safeint.hxx>
22 #include <o3tl/string_view.hxx>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <tools/color.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <rtl/character.hxx>
28 #include <rtl/tencinfo.h>
29 #include <sal/log.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <unotools/datetime.hxx>
33 #include <svl/inettype.hxx>
34 #include <svl/lngmisc.hxx>
35 #include <com/sun/star/beans/PropertyAttribute.hpp>
36 #include <com/sun/star/document/XDocumentProperties.hpp>
37 
38 #include <svtools/parhtml.hxx>
39 #include <svtools/htmltokn.h>
40 #include <svtools/htmlkywd.hxx>
41 
42 #include <utility>
43 
44 using namespace ::com::sun::star;
45 
46 
47 const sal_Int32 MAX_LEN( 1024 );
48 
49 const sal_Int32 MAX_ENTITY_LEN( 8 );
50 
51 
52 // Tables to convert option values into strings
53 
54 // <INPUT TYPE=xxx>
55 HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
56 {
57     { OOO_STRING_SVTOOLS_HTML_IT_text,      HTMLInputType::Text        },
58     { OOO_STRING_SVTOOLS_HTML_IT_password,  HTMLInputType::Password    },
59     { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTMLInputType::Checkbox    },
60     { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTMLInputType::Radio       },
61     { OOO_STRING_SVTOOLS_HTML_IT_range,     HTMLInputType::Range       },
62     { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTMLInputType::Scribble    },
63     { OOO_STRING_SVTOOLS_HTML_IT_file,      HTMLInputType::File        },
64     { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTMLInputType::Hidden      },
65     { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTMLInputType::Submit      },
66     { OOO_STRING_SVTOOLS_HTML_IT_image,     HTMLInputType::Image       },
67     { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTMLInputType::Reset       },
68     { OOO_STRING_SVTOOLS_HTML_IT_button,    HTMLInputType::Button      },
69     { nullptr,                              HTMLInputType(0)    }
70 };
71 
72 // <TABLE FRAME=xxx>
73 HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
74 {
75     { OOO_STRING_SVTOOLS_HTML_TF_void,    HTMLTableFrame::Void    },
76     { OOO_STRING_SVTOOLS_HTML_TF_above,   HTMLTableFrame::Above   },
77     { OOO_STRING_SVTOOLS_HTML_TF_below,   HTMLTableFrame::Below   },
78     { OOO_STRING_SVTOOLS_HTML_TF_hsides,  HTMLTableFrame::HSides  },
79     { OOO_STRING_SVTOOLS_HTML_TF_lhs,     HTMLTableFrame::LHS     },
80     { OOO_STRING_SVTOOLS_HTML_TF_rhs,     HTMLTableFrame::RHS     },
81     { OOO_STRING_SVTOOLS_HTML_TF_vsides,  HTMLTableFrame::VSides  },
82     { OOO_STRING_SVTOOLS_HTML_TF_box,     HTMLTableFrame::Box     },
83     { OOO_STRING_SVTOOLS_HTML_TF_border,  HTMLTableFrame::Box     },
84     { nullptr,                            HTMLTableFrame(0) }
85 };
86 
87 // <TABLE RULES=xxx>
88 HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
89 {
90     { OOO_STRING_SVTOOLS_HTML_TR_none,   HTMLTableRules::NONE      },
91     { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups    },
92     { OOO_STRING_SVTOOLS_HTML_TR_rows,   HTMLTableRules::Rows      },
93     { OOO_STRING_SVTOOLS_HTML_TR_cols,   HTMLTableRules::Cols      },
94     { OOO_STRING_SVTOOLS_HTML_TR_all,    HTMLTableRules::All       },
95     { nullptr,                           HTMLTableRules(0) }
96 };
97 
98 
HTMLOption(HtmlOptionId nTok,OUString _aToken,OUString _aValue)99 HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
100                         OUString _aValue )
101     : aValue(std::move(_aValue))
102     , aToken(std::move(_aToken))
103     , nToken( nTok )
104 {
105     DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END,
106         "HTMLOption: unknown token" );
107 }
108 
GetNumber() const109 sal_uInt32 HTMLOption::GetNumber() const
110 {
111     DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START &&
112                  nToken<HtmlOptionId::NUMBER_END) ||
113                 (nToken>=HtmlOptionId::CONTEXT_START &&
114                  nToken<HtmlOptionId::CONTEXT_END) ||
115                 nToken==HtmlOptionId::VALUE,
116         "GetNumber: Option not numerical" );
117     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
118     sal_Int32 nTmp = aTmp.toInt32();
119     return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
120 }
121 
GetSNumber() const122 sal_Int32 HTMLOption::GetSNumber() const
123 {
124     DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
125                 (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
126         "GetSNumber: Option not numerical" );
127     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
128     return aTmp.toInt32();
129 }
130 
GetNumbers(std::vector<sal_uInt32> & rNumbers) const131 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
132 {
133     rNumbers.clear();
134 
135     // This is a very simplified scanner: it only searches all
136     // numerals in the string.
137     bool bInNum = false;
138     sal_uInt32 nNum = 0;
139     for( sal_Int32 i=0; i<aValue.getLength(); i++ )
140     {
141         sal_Unicode c = aValue[ i ];
142         if( c>='0' && c<='9' )
143         {
144             nNum *= 10;
145             nNum += (c - '0');
146             bInNum = true;
147         }
148         else if( bInNum )
149         {
150             rNumbers.push_back( nNum );
151             bInNum = false;
152             nNum = 0;
153         }
154     }
155     if( bInNum )
156     {
157         rNumbers.push_back( nNum );
158     }
159 }
160 
GetColor(Color & rColor) const161 void HTMLOption::GetColor( Color& rColor ) const
162 {
163     DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE,
164         "GetColor: Option is not a color." );
165 
166     OUString aTmp(aValue.toAsciiLowerCase());
167     sal_uInt32 nColor = SAL_MAX_UINT32;
168     if (!aTmp.isEmpty() && aTmp[0] != '#')
169         nColor = GetHTMLColor(aTmp);
170 
171     if( SAL_MAX_UINT32 == nColor )
172     {
173         nColor = 0;
174         sal_Int32 nPos = 0;
175         for (sal_uInt32 i=0; i<6; ++i)
176         {
177             // Whatever Netscape does to get color values,
178             // at maximum three characters < '0' are ignored.
179             sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
180             if( c < '0' )
181             {
182                 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
183                 if( c < '0' )
184                     c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
185             }
186             nColor *= 16;
187             if( c >= '0' && c <= '9' )
188                 nColor += (c - '0');
189             else if( c >= 'a' && c <= 'f' )
190                 nColor += (c + 0xa - 'a');
191         }
192     }
193 
194     rColor.SetRed(   static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
195     rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
196     rColor.SetBlue(  static_cast<sal_uInt8>(nColor & 0x000000ff) );
197 }
198 
GetInputType() const199 HTMLInputType HTMLOption::GetInputType() const
200 {
201     DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
202     return GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
203 }
204 
GetTableFrame() const205 HTMLTableFrame HTMLOption::GetTableFrame() const
206 {
207     DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
208     return GetEnum( aTableFrameOptEnums );
209 }
210 
GetTableRules() const211 HTMLTableRules HTMLOption::GetTableRules() const
212 {
213     DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
214     return GetEnum( aTableRulesOptEnums );
215 }
216 
HTMLParser(SvStream & rIn,bool bReadNewDoc)217 HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
218     SvParser<HtmlTokenId>( rIn ),
219     bNewDoc(bReadNewDoc),
220     bIsInHeader(true),
221     bReadListing(false),
222     bReadXMP(false),
223     bReadPRE(false),
224     bReadTextArea(false),
225     bReadScript(false),
226     bReadStyle(false),
227     bEndTokenFound(false),
228     bPre_IgnoreNewPara(false),
229     bReadNextChar(false),
230     bReadComment(false),
231     nPre_LinePos(0),
232     mnPendingOffToken(HtmlTokenId::NONE)
233 {
234     //#i76649, default to UTF-8 for HTML unless we know differently
235     SetSrcEncoding(RTL_TEXTENCODING_UTF8);
236 }
237 
~HTMLParser()238 HTMLParser::~HTMLParser()
239 {
240 }
241 
SetNamespace(std::u16string_view rNamespace)242 void HTMLParser::SetNamespace(std::u16string_view rNamespace)
243 {
244     // Convert namespace alias to a prefix.
245     maNamespace = OUString::Concat(rNamespace) + ":";
246 }
247 
248 namespace
249 {
250     class RefGuard
251     {
252     private:
253         HTMLParser& m_rParser;
254     public:
RefGuard(HTMLParser & rParser)255         RefGuard(HTMLParser& rParser)
256             : m_rParser(rParser)
257         {
258             m_rParser.AddFirstRef();
259         }
260 
~RefGuard()261         ~RefGuard()
262         {
263             if (m_rParser.GetStatus() != SvParserState::Pending)
264                 m_rParser.ReleaseRef(); // Parser not needed anymore
265         }
266     };
267 }
268 
CallParser()269 SvParserState HTMLParser::CallParser()
270 {
271     eState = SvParserState::Working;
272     nNextCh = GetNextChar();
273     SaveState( HtmlTokenId::NONE );
274 
275     nPre_LinePos = 0;
276     bPre_IgnoreNewPara = false;
277 
278     RefGuard aRefGuard(*this);
279 
280     Continue( HtmlTokenId::NONE );
281 
282     return eState;
283 }
284 
Continue(HtmlTokenId nToken)285 void HTMLParser::Continue( HtmlTokenId nToken )
286 {
287     if( nToken == HtmlTokenId::NONE )
288         nToken = GetNextToken();
289 
290     while( IsParserWorking() )
291     {
292         SaveState( nToken );
293         nToken = FilterToken( nToken );
294 
295         if( nToken != HtmlTokenId::NONE )
296             NextToken( nToken );
297 
298         if( IsParserWorking() )
299             SaveState( HtmlTokenId::NONE );         // continue with new token
300 
301         nToken = GetNextToken();
302     }
303 }
304 
FilterToken(HtmlTokenId nToken)305 HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
306 {
307     switch( nToken )
308     {
309     case HtmlTokenId(EOF):
310         nToken = HtmlTokenId::NONE;
311         break;          // don't pass
312 
313     case HtmlTokenId::HEAD_OFF:
314         bIsInHeader = false;
315         break;
316 
317     case HtmlTokenId::HEAD_ON:
318         bIsInHeader = true;
319         break;
320 
321     case HtmlTokenId::BODY_ON:
322         bIsInHeader = false;
323         break;
324 
325     case HtmlTokenId::FRAMESET_ON:
326         bIsInHeader = false;
327         break;
328 
329     case HtmlTokenId::BODY_OFF:
330         bReadPRE = bReadListing = bReadXMP = false;
331         break;
332 
333     case HtmlTokenId::HTML_OFF:
334         nToken = HtmlTokenId::NONE;
335         bReadPRE = bReadListing = bReadXMP = false;
336         break;      // HtmlTokenId::ON hasn't been passed either !
337 
338     case HtmlTokenId::PREFORMTXT_ON:
339         StartPRE();
340         break;
341 
342     case HtmlTokenId::PREFORMTXT_OFF:
343         FinishPRE();
344         break;
345 
346     case HtmlTokenId::LISTING_ON:
347         StartListing();
348         break;
349 
350     case HtmlTokenId::LISTING_OFF:
351         FinishListing();
352         break;
353 
354     case HtmlTokenId::XMP_ON:
355         StartXMP();
356         break;
357 
358     case HtmlTokenId::XMP_OFF:
359         FinishXMP();
360         break;
361 
362     default:
363         if( bReadPRE )
364             nToken = FilterPRE( nToken );
365         else if( bReadListing )
366             nToken = FilterListing( nToken );
367         else if( bReadXMP )
368             nToken = FilterXMP( nToken );
369 
370         break;
371     }
372 
373     return nToken;
374 }
375 
376 namespace {
377 
HTML_ISPRINTABLE(sal_Unicode c)378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
379 
HTML_ISSPACE(sal_uInt32 c)380 constexpr bool HTML_ISSPACE(sal_uInt32 c)
381 {
382     return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c;
383 }
384 
385 }
386 
ScanText(const sal_Unicode cBreak)387 HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak)
388 {
389     OUStringBuffer sTmpBuffer( MAX_LEN );
390     bool bContinue = true;
391     bool bEqSignFound = false;
392     sal_uInt32  cQuote = 0U;
393 
394     while( bContinue && IsParserWorking() )
395     {
396         bool bNextCh = true;
397         switch( nNextCh )
398         {
399         case '&':
400             bEqSignFound = false;
401             if( bReadXMP )
402                 sTmpBuffer.append( '&' );
403             else
404             {
405                 sal_uInt64 nStreamPos = rInput.Tell();
406                 sal_uInt32 nLinePos = GetLinePos();
407 
408                 sal_uInt32 cChar = 0U;
409                 if( '#' == (nNextCh = GetNextChar()) )
410                 {
411                     nNextCh = GetNextChar();
412                     const bool bIsHex( 'x' == nNextCh );
413                     const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
414                     if ( bIsDecOrHex )
415                     {
416                         if ( bIsHex )
417                         {
418                             nNextCh = GetNextChar();
419                             while ( rtl::isAsciiHexDigit(nNextCh) )
420                             {
421                                 cChar = cChar * 16U +
422                                         ( nNextCh <= '9'
423                                           ? sal_uInt32( nNextCh - '0' )
424                                           : ( nNextCh <= 'F'
425                                               ? sal_uInt32( nNextCh - 'A' + 10 )
426                                               : sal_uInt32( nNextCh - 'a' + 10 ) ) );
427                                 nNextCh = GetNextChar();
428                             }
429                         }
430                         else
431                         {
432                             do
433                             {
434                                 cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
435                                 nNextCh = GetNextChar();
436                             }
437                             while( rtl::isAsciiDigit(nNextCh) );
438                         }
439 
440                         if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
441                             RTL_TEXTENCODING_UCS2 != eSrcEnc &&
442                             RTL_TEXTENCODING_UTF8 != eSrcEnc &&
443                             cChar < 256 )
444                         {
445                             const sal_uInt32 convertFlags =
446                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
447                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
448                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
449 
450                             char cEncodedChar = static_cast<char>(cChar);
451                             cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
452                             if( 0U == cChar )
453                             {
454                                 // If the character could not be
455                                 // converted, because a conversion is not
456                                 // available, do no conversion at all.
457                                 cChar = cEncodedChar;
458                             }
459                         }
460                     }
461                     else
462                         nNextCh = 0U;
463 
464                     if (!rtl::isUnicodeCodePoint(cChar)
465                         || (linguistic::IsControlChar(cChar)
466                             && cChar != '\r' && cChar != '\n' && cChar != '\t'))
467                     {
468                         cChar = '?';
469                     }
470                 }
471                 else if( rtl::isAsciiAlpha( nNextCh ) )
472                 {
473                     OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
474                     sal_Int32 nPos = 0;
475                     do
476                     {
477                         sEntityBuffer.appendUtf32( nNextCh );
478                         nPos++;
479                         nNextCh = GetNextChar();
480                     }
481                     while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
482                            !rInput.eof() );
483 
484                     if( IsParserWorking() && !rInput.eof() )
485                     {
486                         std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
487                         cChar = GetHTMLCharName( sEntity );
488 
489                         // not found ( == 0 ): plain text
490                         // or a character which is inserted as attribute
491                         if( 0U == cChar && ';' != nNextCh )
492                         {
493                             DBG_ASSERT( rInput.Tell() - nStreamPos ==
494                                         static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
495                                         "UTF-8 is failing here" );
496                             for( sal_Int32 i = nPos-1; i>1; i-- )
497                             {
498                                 nNextCh = sEntityBuffer[i];
499                                 sEntityBuffer.setLength( i );
500                                 sEntity = sEntityBuffer.subView(0, i);
501                                 cChar = GetHTMLCharName( sEntity );
502                                 if( cChar )
503                                 {
504                                     rInput.SeekRel( -static_cast<sal_Int64>
505                                             (nPos-i)*GetCharSize() );
506                                     nlLinePos -= sal_uInt32(nPos-i);
507                                     nPos = i;
508                                     ClearTxtConvContext();
509                                     break;
510                                 }
511                             }
512                         }
513 
514                         if( !cChar )        // unknown character?
515                         {
516                             // back in stream, insert '&'
517                             // and restart with next character
518                             sTmpBuffer.append( '&' );
519 
520                             DBG_ASSERT( rInput.Tell()-nStreamPos ==
521                                         static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
522                                         "Wrong stream position" );
523                             DBG_ASSERT( nlLinePos-nLinePos ==
524                                         static_cast<sal_uInt32>(nPos+1),
525                                         "Wrong line position" );
526                             rInput.Seek( nStreamPos );
527                             nlLinePos = nLinePos;
528                             ClearTxtConvContext();
529                             break;
530                         }
531 
532                         assert(cChar != 0);
533 
534                         // 1 == Non Breaking Space
535                         // 2 == SoftHyphen
536 
537                         if (cChar == 1 || cChar == 2)
538                         {
539                             if( '>' == cBreak )
540                             {
541                                 // When reading the content of a tag we have
542                                 // to change it to ' ' or '-'
543                                 if( 1U == cChar )
544                                     cChar = ' ';
545                                 else //2U
546                                     cChar = '-';
547                             }
548                             else
549                             {
550                                 // If not scanning a tag return token
551                                 aToken.append( sTmpBuffer );
552                                 sTmpBuffer.setLength(0);
553 
554                                 if( !aToken.isEmpty() )
555                                 {
556                                     // restart with character
557                                     nNextCh = '&';
558                                     DBG_ASSERT( rInput.Tell()-nStreamPos ==
559                                                 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
560                                                 "Wrong stream position" );
561                                     DBG_ASSERT( nlLinePos-nLinePos ==
562                                                 static_cast<sal_uInt32>(nPos+1),
563                                                 "Wrong line position" );
564                                     rInput.Seek( nStreamPos );
565                                     nlLinePos = nLinePos;
566                                     ClearTxtConvContext();
567                                     return HtmlTokenId::TEXTTOKEN;
568                                 }
569 
570                                 // Hack: _GetNextChar shall not read the
571                                 // next character
572                                 if( ';' != nNextCh )
573                                     aToken.append( " " );
574                                 if( 1U == cChar )
575                                     return HtmlTokenId::NONBREAKSPACE;
576                                 else //2U
577                                     return HtmlTokenId::SOFTHYPH;
578                             }
579                         }
580                     }
581                     else
582                         nNextCh = 0U;
583                 }
584                 // &{...};-JavaScript-Macros are not supported any longer.
585                 else if( IsParserWorking() )
586                 {
587                     sTmpBuffer.append( '&' );
588                     bNextCh = false;
589                     break;
590                 }
591 
592                 bNextCh = (';' == nNextCh);
593                 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
594                                     cChar=='\"' || cChar==' ') )
595                 {
596                     // ' and " have to be escaped within tags to separate
597                     // them from ' and " enclosing options.
598                     // \ has to be escaped as well.
599                     // Space is protected because it's not a delimiter between
600                     // options.
601                     sTmpBuffer.append( '\\' );
602                 }
603                 if( IsParserWorking() )
604                 {
605                     if( cChar )
606                         sTmpBuffer.appendUtf32( cChar );
607                 }
608                 else if( SvParserState::Pending==eState && '>'!=cBreak )
609                 {
610                     // Restart with '&', the remainder is returned as
611                     // text token.
612                     if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
613                     {
614                         // _GetNextChar() returns the previous text and
615                         // during the next execution a new character is read.
616                         // Thus we have to position in front of the '&'.
617                         nNextCh = 0U;
618                         rInput.Seek( nStreamPos - GetCharSize() );
619                         nlLinePos = nLinePos-1;
620                         ClearTxtConvContext();
621                         bReadNextChar = true;
622                     }
623                     bNextCh = false;
624                 }
625             }
626             break;
627         case '=':
628             if( '>'==cBreak && !cQuote )
629                 bEqSignFound = true;
630             sTmpBuffer.appendUtf32( nNextCh );
631             break;
632 
633         case '\\':
634             if( '>'==cBreak )
635             {
636                 // mark within tags
637                 sTmpBuffer.append( '\\' );
638             }
639             sTmpBuffer.append( '\\' );
640             break;
641 
642         case '\"':
643         case '\'':
644             if( '>'==cBreak )
645             {
646                 if( bEqSignFound )
647                     cQuote = nNextCh;
648                 else if( cQuote && (cQuote==nNextCh ) )
649                     cQuote = 0U;
650             }
651             sTmpBuffer.appendUtf32( nNextCh );
652             bEqSignFound = false;
653             break;
654 
655         case sal_Unicode(EOF):
656             if( rInput.eof() )
657             {
658                 bContinue = false;
659             }
660             // else: ignore, not a valid code point
661             break;
662 
663         case '<':
664             bEqSignFound = false;
665             if( '>'==cBreak )
666                 sTmpBuffer.appendUtf32( nNextCh );
667             else
668                 bContinue = false;      // break, string is together
669             break;
670 
671         case '\f':
672             if( '>' == cBreak )
673             {
674                 // If scanning options treat it like a space, ...
675                 sTmpBuffer.append( ' ' );
676             }
677             else
678             {
679                 // otherwise it's a separate token.
680                 bContinue = false;
681             }
682             break;
683 
684         case '\r':
685         case '\n':
686             if( '>'==cBreak )
687             {
688                 // cr/lf in tag is handled in GetNextToken_()
689                 sTmpBuffer.appendUtf32( nNextCh );
690                 break;
691             }
692             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
693             {
694                 bContinue = false;
695                 break;
696             }
697             // Reduce sequence of CR/LF/BLANK/TAB to a single blank
698             [[fallthrough]];
699         case '\t':
700             if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
701             {
702                 // Pass Tabs up in <PRE>
703                 bContinue = false;
704                 break;
705             }
706             [[fallthrough]];
707         case '\x0b':
708             if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
709                 '>'!=cBreak )
710             {
711                 break;
712             }
713             if (!m_bPreserveSpaces)
714                 nNextCh = ' ';
715             [[fallthrough]];
716         case ' ':
717             if (!m_bPreserveSpaces)
718             {
719                 sTmpBuffer.appendUtf32(nNextCh);
720                 if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
721                 {
722                     // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
723                     do
724                     {
725                         nNextCh = GetNextChar();
726                         if (sal_Unicode(EOF) == nNextCh && rInput.eof())
727                         {
728                             if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
729                             {
730                                 // Have seen s.th. aside from blanks?
731                                 aToken.append(sTmpBuffer);
732                                 sTmpBuffer.setLength(0);
733                                 return HtmlTokenId::TEXTTOKEN;
734                             }
735                             else
736                                 // Only read blanks: no text must be returned
737                                 // and GetNextToken_ has to read until EOF
738                                 return HtmlTokenId::NONE;
739                         }
740                     } while (HTML_ISSPACE(nNextCh));
741                     bNextCh = false;
742                 }
743                 break;
744             }
745             [[fallthrough]];
746         default:
747             bEqSignFound = false;
748             if (nNextCh == cBreak && !cQuote)
749                 bContinue = false;
750             else
751             {
752                 do {
753                     if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
754                     {
755                     // All remaining characters make their way into the text.
756                         sTmpBuffer.appendUtf32( nNextCh );
757                     }
758 
759                     nNextCh = GetNextChar();
760                     if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
761                         !IsParserWorking() )
762                     {
763                         if( !sTmpBuffer.isEmpty() )
764                             aToken.append( sTmpBuffer );
765                         return HtmlTokenId::TEXTTOKEN;
766                     }
767                 } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
768                 bNextCh = false;
769             }
770         }
771 
772         if( bContinue && bNextCh )
773             nNextCh = GetNextChar();
774     }
775 
776     if( !sTmpBuffer.isEmpty() )
777         aToken.append( sTmpBuffer );
778 
779     return HtmlTokenId::TEXTTOKEN;
780 }
781 
// Reads raw (unparsed) text while a script/style section or a custom end
// token (aEndToken) is active. Characters are collected into aToken until a
// line break, the end of the input, or the tag that terminates the raw
// section is reached.
//
// Returns HtmlTokenId::RAWDATA by default. Returns HtmlTokenId::NONE when
//  - the end token was already found during the previous call,
//  - the end token (or EOF) is found and no text is pending in a
//    script/style section, or
//  - the parser is no longer in working state.
// When the end token is found the stream is rewound to its '<', so that the
// caller can parse the tag itself.
GetNextRawToken()782 HtmlTokenId HTMLParser::GetNextRawToken()
783 {
784     OUStringBuffer sTmpBuffer( MAX_LEN );
785 
786     if( bEndTokenFound )
787     {
788         // During the last execution we already found the end token,
789         // thus we don't have to search it again.
790         bReadScript = false;
791         bReadStyle = false;
792         aEndToken.clear();
793         bEndTokenFound = false;
794 
795         return HtmlTokenId::NONE;
796     }
797 
798     // Default return value: HtmlTokenId::RAWDATA
799     bool bContinue = true;
800     HtmlTokenId nToken = HtmlTokenId::RAWDATA;
801     SaveState( HtmlTokenId::NONE );
802     while( bContinue && IsParserWorking() )
803     {
            // bNextCh: read a fresh character at the end of this iteration.
            // Branches that already hold an unprocessed character in nNextCh
            // set it to false.
804         bool bNextCh = true;
805         switch( nNextCh )
806         {
807         case '<':
808             {
809                 // Maybe we've reached the end.
810 
811                 // Save what we have read previously...
812                 aToken.append( sTmpBuffer );
813                 sTmpBuffer.setLength(0);
814 
815                 // and remember position in stream.
816                 sal_uInt64 nStreamPos = rInput.Tell();
817                 sal_uInt32 nLineNr = GetLineNr();
818                 sal_uInt32 nLinePos = GetLinePos();
819 
820                 // Start of an end token?
821                 bool bOffState = false;
822                 if( '/' == (nNextCh = GetNextChar()) )
823                 {
824                     bOffState = true;
825                     nNextCh = GetNextChar();
826                 }
827                 else if( '!' == nNextCh )
828                 {
                        // Keep the '!' so that a comment start can be
                        // recognized in the tag name below.
829                     sTmpBuffer.appendUtf32( nNextCh );
830                     nNextCh = GetNextChar();
831                 }
832 
833                 // Read following letters
834                 while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
835                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
836                 {
837                     sTmpBuffer.appendUtf32( nNextCh );
838                     nNextCh = GetNextChar();
839                 }
840 
                    // The tag name is compared in lower case.
841                 OUString aTok( sTmpBuffer.toString() );
842                 aTok = aTok.toAsciiLowerCase();
843                 bool bDone = false;
844                 if( bReadScript || !aEndToken.isEmpty() )
845                 {
                        // Script or custom end token: an end tag is only
                        // accepted while we are not inside a comment.
846                     if( !bReadComment )
847                     {
848                         if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
849                         {
850                             bReadComment = true;
851                         }
852                         else
853                         {
854                             // A script has to end with "</SCRIPT>". But
855                             // ">" is optional for security reasons
856                             bDone = bOffState &&
857                             ( bReadScript
858                                 ? aTok == OOO_STRING_SVTOOLS_HTML_script
859                                 : aTok == aEndToken );
860                         }
861                     }
862                     if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
863                     {
864                         // End of comment of style <!----->
865                         bReadComment = false;
866                     }
867                 }
868                 else
869                 {
870                     // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
871                     if( bOffState )
872                         bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
873                                 aTok == OOO_STRING_SVTOOLS_HTML_head;
874                     else
875                         bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
876                 }
877 
878                 if( bDone )
879                 {
880                     // Done! Return the previously read string (if requested)
881                     // and continue.
882 
883                     bContinue = false;
884 
885                     // nToken==0 means, GetNextToken_ continues to read
886                     if( aToken.isEmpty() && (bReadStyle || bReadScript) )
887                     {
888                         // Immediately close environment (or context?)
889                         // and parse the end token
890                         bReadScript = false;
891                         bReadStyle = false;
892                         aEndToken.clear();
893                         nToken = HtmlTokenId::NONE;
894                     }
895                     else
896                     {
897                         // Keep bReadScript/bReadStyle alive
898                         // and parse end token during next execution
899                         bEndTokenFound = true;
900                     }
901 
902                     // Move backwards in stream to '<'
903                     rInput.Seek( nStreamPos );
904                     SetLineNr( nLineNr );
905                     SetLinePos( nLinePos );
906                     ClearTxtConvContext();
907                     nNextCh = '<';
908 
909                     // Don't append string to token.
910                     sTmpBuffer.setLength( 0 );
911                 }
912                 else
913                 {
914                     // remember "</" , everything else we find in the buffer
915                     aToken.append( "<" );
916                     if( bOffState )
917                         aToken.append( "/" );
918 
                        // nNextCh already holds the first character after
                        // the tag name; it is handled by the next iteration.
919                     bNextCh = false;
920                 }
921             }
922             break;
923         case '-':
                // Outside of a comment the '-' is simply buffered. Inside a
                // comment a run of at least two '-' followed by '>' ends the
                // comment.
924             sTmpBuffer.appendUtf32( nNextCh );
925             if( bReadComment )
926             {
927                 bool bTwoMinus = false;
928                 nNextCh = GetNextChar();
929                 while( '-' == nNextCh && IsParserWorking() )
930                 {
931                     bTwoMinus = true;
932                     sTmpBuffer.appendUtf32( nNextCh );
933                     nNextCh = GetNextChar();
934                 }
935 
936                 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
937                     bReadComment = false;
938 
                    // The character after the dashes is still unprocessed.
939                 bNextCh = false;
940             }
941             break;
942 
943         case '\r':
944             // \r\n? closes the current text token (even if it's empty)
945             nNextCh = GetNextChar();
946             if( nNextCh=='\n' )
947                 nNextCh = GetNextChar();
948             bContinue = false;
949             break;
950         case '\n':
951             // \n closes the current text token (even if it's empty)
952             nNextCh = GetNextChar();
953             bContinue = false;
954             break;
955         case sal_Unicode(EOF):
956             // eof closes the current text token and behaves like having read
957             // an end token
                // (only if the stream really is at its end; otherwise the
                // character is skipped)
958             if( rInput.eof() )
959             {
960                 bContinue = false;
961                 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
962                 {
                        // Deliver the pending text first; the raw mode is
                        // left at the beginning of the next call.
963                     bEndTokenFound = true;
964                 }
965                 else
966                 {
967                     bReadScript = false;
968                     bReadStyle = false;
969                     aEndToken.clear();
970                     nToken = HtmlTokenId::NONE;
971                 }
972             }
973             break;
974         default:
                // Control characters other than tab are dropped.
975             if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
976             {
977                 // all remaining characters are appended to the buffer
978                 sTmpBuffer.appendUtf32( nNextCh );
979             }
980             break;
981         }
982 
            // Flush the buffered text into the token once the loop is about
            // to end.
983         if( !bContinue && !sTmpBuffer.isEmpty() )
984         {
985             aToken.append( sTmpBuffer );
986             sTmpBuffer.setLength(0);
987         }
988 
989         if( bContinue && bNextCh )
990             nNextCh = GetNextChar();
991     }
992 
        // If the parser stopped working while reading, no token is reported.
993     if( IsParserWorking() )
994         SaveState( HtmlTokenId::NONE );
995     else
996         nToken = HtmlTokenId::NONE;
997 
998     return nToken;
999 }
1000 
1001 // Scan next token
// Scans the next token from the input and returns its id.
//
// Order of processing:
//  1. A pending _OFF token (mnPendingOffToken, generated for a tag that was
//     closed with "/>") is returned first, with an empty aToken.
//  2. While a script/style section or a custom end token is active, the
//     input is read via GetNextRawToken(); only if that yields NONE (and the
//     parser is still working) the regular scanning below continues.
//  3. Otherwise the current character nNextCh decides: '<' starts a tag,
//     comment, CDATA section or <% ... %> block; EOF at the real end of the
//     stream sets the parser state to Accepted; form feed, line breaks and
//     tabs are reported as separate tokens in some modes; everything else is
//     read as text via ScanText().
//
// For tags, sSaveToken receives the tag name as written and aToken ends up
// with whatever ScanText('>') collected after the name. If the parser ends
// in Pending state, HtmlTokenId::INVALID is returned.
GetNextToken_()1002 HtmlTokenId HTMLParser::GetNextToken_()
1003 {
1004     HtmlTokenId nRet = HtmlTokenId::NONE;
1005     sSaveToken.clear();
1006 
1007     if (mnPendingOffToken != HtmlTokenId::NONE)
1008     {
1009         // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1010         nRet = mnPendingOffToken;
1011         mnPendingOffToken = HtmlTokenId::NONE;
1012         aToken.setLength( 0 );
1013         return nRet;
1014     }
1015 
1016     // Delete options
1017     maOptions.clear();
1018 
1019     if( !IsParserWorking() )        // Don't continue if already an error occurred
1020         return HtmlTokenId::NONE;
1021 
         // Remember the flag so that it can be restored if the parser turns
         // Pending in the middle of a tag.
1022     bool bReadNextCharSave = bReadNextChar;
1023     if( bReadNextChar )
1024     {
1025         DBG_ASSERT( !bEndTokenFound,
1026                     "Read a character despite </SCRIPT> was read?" );
1027         nNextCh = GetNextChar();
1028         if( !IsParserWorking() )        // Don't continue if already an error occurred
1029             return HtmlTokenId::NONE;
1030         bReadNextChar = false;
1031     }
1032 
1033     if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1034     {
             // Raw mode. A NONE result with a working parser means the end
             // token was reached and is parsed by the loop below.
1035         nRet = GetNextRawToken();
1036         if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1037             return nRet;
1038     }
1039 
1040     do {
             // bNextCh: read a fresh character at the end of this iteration.
1041         bool bNextCh = true;
1042         switch( nNextCh )
1043         {
1044         case '<':
1045             {
                     // Remember the position behind '<' for a possible rewind.
1046                 sal_uInt64 nStreamPos = rInput.Tell();
1047                 sal_uInt32 nLineNr = GetLineNr();
1048                 sal_uInt32 nLinePos = GetLinePos();
1049 
1050                 bool bOffState = false;
1051                 if( '/' == (nNextCh = GetNextChar()) )
1052                 {
1053                     bOffState = true;
1054                     nNextCh = GetNextChar();
1055                 }
1056                 // Assume '<?' is a start of an XML declaration, ignore it.
1057                 if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1058                 {
                         // Read the tag name: everything up to '>', '/',
                         // white space, a control character or the end of the
                         // input. "![CDATA[" ends the name immediately.
1059                     OUStringBuffer sTmpBuffer;
1060                     do {
1061                         sTmpBuffer.appendUtf32( nNextCh );
1062                         nNextCh = GetNextChar();
1063                         if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1064                             break;
1065                         if (bFuzzing && sTmpBuffer.getLength() > 1024)
1066                         {
1067                             SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1068                             eState = SvParserState::Error;
1069                             break;
1070                         }
1071                     } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1072                             !linguistic::IsControlChar(nNextCh) &&
1073                              IsParserWorking() && !rInput.eof() );
1074 
1075                     if( !sTmpBuffer.isEmpty() )
1076                     {
1077                         aToken.append( sTmpBuffer );
1078                         sTmpBuffer.setLength(0);
1079                     }
1080 
1081                     // Skip blanks
1082                     while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1083                         nNextCh = GetNextChar();
1084 
1085                     if( !IsParserWorking() )
1086                     {
1087                         if( SvParserState::Pending == eState )
1088                             bReadNextChar = bReadNextCharSave;
1089                         break;
1090                     }
1091 
1092                     // Search token in table:
                         // (sSaveToken keeps the name as written, the lookup
                         // uses the lower-case form without the namespace
                         // prefix)
1093                     sSaveToken = aToken;
1094                     aToken = aToken.toString().toAsciiLowerCase();
1095 
1096                     if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1097                         aToken.remove( 0, maNamespace.getLength());
1098 
1099                     if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1100                         // Unknown control
1101                         nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1102 
1103                     // If it's a token which can be switched off...
1104                     if( bOffState )
1105                     {
1106                          if( nRet >= HtmlTokenId::ONOFF_START )
1107                          {
1108                             // and there is an off token, return off token instead
                                 // NOTE(review): presumably every _OFF id
                                 // directly follows its _ON id in HtmlTokenId
                                 // - confirm in htmltokn.h
1109                             nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1110                          }
1111                          else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1112                          {
1113                             // and there is no off token, return unknown token.
1114                             // (except for </BR>, that is treated like <BR>)
1115                             // No exception for XHTML, though.
1116                             nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
1117                          }
1118                     }
1119 
1120                     if( nRet == HtmlTokenId::COMMENT )
1121                     {
1122                         // fix: due to being case sensitive use sSaveToken as start of comment
1123                         //      and append a blank.
1124                         aToken = sSaveToken;
1125                         if( '>'!=nNextCh )
1126                             aToken.append( " " );
                             // State at the first '>' seen, used as restart
                             // point if no closing "-->" is found.
1127                         sal_uInt64 nCStreamPos = 0;
1128                         sal_uInt32 nCLineNr = 0;
1129                         sal_uInt32 nCLinePos = 0;
1130                         sal_Int32 nCStrLen = 0;
1131 
1132                         bool bDone = false;
1133                         // Read until closing -->. If not found restart at first >
1134                         sTmpBuffer = aToken;
1135                         while( !bDone && !rInput.eof() && IsParserWorking() )
1136                         {
1137                             if( '>'==nNextCh )
1138                             {
1139                                 if( !nCStreamPos )
1140                                 {
1141                                     nCStreamPos = rInput.Tell();
1142                                     nCStrLen = sTmpBuffer.getLength();
1143                                     nCLineNr = GetLineNr();
1144                                     nCLinePos = GetLinePos();
1145                                 }
                                     // The comment ends if the text read so
                                     // far ends with "--".
1146                                 bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1147                                 if( !bDone )
1148                                     sTmpBuffer.appendUtf32(nNextCh);
1149                             }
1150                             else if (!linguistic::IsControlChar(nNextCh)
1151                                 || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1152                             {
1153                                 sTmpBuffer.appendUtf32(nNextCh);
1154                             }
1155                             if( !bDone )
1156                                 nNextCh = GetNextChar();
1157                         }
1158                         aToken = sTmpBuffer;
1159                         sTmpBuffer.setLength(0);
1160                         if( !bDone && IsParserWorking() && nCStreamPos )
1161                         {
                                 // No "-->" found: go back to the first '>'
                                 // and cut the token there.
1162                             rInput.Seek( nCStreamPos );
1163                             SetLineNr( nCLineNr );
1164                             SetLinePos( nCLinePos );
1165                             ClearTxtConvContext();
1166                             aToken.truncate(nCStrLen);
1167                             nNextCh = '>';
1168                         }
1169                     }
1170                     else if (nRet == HtmlTokenId::CDATA)
1171                     {
1172                         // Read until the closing ]]>.
                             // sTmpBuffer is empty here, so aToken ends up
                             // with the section content only. Unlike in the
                             // comment branch there is no exception for
                             // \r, \n and \t in the control character check.
1173                         bool bDone = false;
1174                         while (!bDone && !rInput.eof() && IsParserWorking())
1175                         {
1176                             if (nNextCh == '>')
1177                             {
1178                                 if (sTmpBuffer.getLength() >= 2)
1179                                 {
1180                                     bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1181                                             && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1182                                     if (bDone)
1183                                     {
1184                                         // Ignore ]] at the end.
1185                                         sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1186                                     }
1187                                 }
1188                                 if (!bDone)
1189                                 {
1190                                     sTmpBuffer.appendUtf32(nNextCh);
1191                                 }
1192                             }
1193                             else if (!linguistic::IsControlChar(nNextCh))
1194                             {
1195                                 sTmpBuffer.appendUtf32(nNextCh);
1196                             }
1197                             if (!bDone)
1198                             {
1199                                 nNextCh = GetNextChar();
1200                             }
1201                         }
1202                         aToken = sTmpBuffer;
1203                         sTmpBuffer.setLength(0);
1204                     }
1205                     else
1206                     {
1207                         // TokenString not needed anymore
1208                         aToken.setLength( 0 );
1209                     }
1210 
1211                     // Read until closing '>'
1212                     if( '>' != nNextCh && IsParserWorking() )
1213                     {
1214                         ScanText( '>' );
1215 
1216                         // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217                         // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1218                         // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1219                         // which lead to fdo#56772.
1220                         if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1221                         {
1222                             mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);       // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1223                             aToken.setLength( aToken.getLength()-1 );   // remove trailing '/'
1224                         }
1225                         if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1226                         {
1227                             // Move back in front of < and restart there.
1228                             // Return < as text.
1229                             rInput.Seek( nStreamPos );
1230                             SetLineNr( nLineNr );
1231                             SetLinePos( nLinePos );
1232                             ClearTxtConvContext();
1233 
1234                             aToken = "<";
1235                             nRet = HtmlTokenId::TEXTTOKEN;
1236                             nNextCh = GetNextChar();
1237                             bNextCh = false;
1238                             break;
1239                         }
1240                     }
1241                     if( SvParserState::Pending == eState )
1242                         bReadNextChar = bReadNextCharSave;
1243                 }
1244                 else
1245                 {
                         // '<' (or "</") not followed by a letter, '!' or '?'.
1246                     if( bOffState )
1247                     {
1248                         // simply throw away everything
1249                         ScanText( '>' );
1250                         if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1251                         {
1252                             // Move back in front of < and restart there.
1253                             // Return < as text.
1254                             rInput.Seek( nStreamPos );
1255                             SetLineNr( nLineNr );
1256                             SetLinePos( nLinePos );
1257                             ClearTxtConvContext();
1258 
1259                             aToken = "<";
1260                             nRet = HtmlTokenId::TEXTTOKEN;
1261                             nNextCh = GetNextChar();
1262                             bNextCh = false;
1263                             break;
1264                         }
1265                         if( SvParserState::Pending == eState )
1266                             bReadNextChar = bReadNextCharSave;
1267                         aToken.setLength( 0 );
1268                     }
1269                     else if( '%' == nNextCh )
1270                     {
                             // "<% ... %>" block, reported as unknown control.
1271                         nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1272 
1273                         sal_uInt64 nCStreamPos = rInput.Tell();
1274                         sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1275 
1276                         bool bDone = false;
1277                         // Read until closing %>. If not found restart at first >.
1278                         sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1279                         OUStringBuffer aTmpBuffer(aToken);
1280                         while( !bDone && !rInput.eof() && IsParserWorking() )
1281                         {
1282                             bDone = '>'==nNextCh && nLastTokenChar == '%';
1283                             if( !bDone )
1284                             {
1285                                 aTmpBuffer.appendUtf32(nNextCh);
1286                                 nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1287                                 nNextCh = GetNextChar();
1288                             }
1289                         }
1290                         if( !bDone && IsParserWorking() )
1291                         {
                                 // No closing "%>": rewind to the position
                                 // behind "<%" and report "<%" as text.
1292                             rInput.Seek( nCStreamPos );
1293                             SetLineNr( nCLineNr );
1294                             SetLinePos( nCLinePos );
1295                             ClearTxtConvContext();
1296                             aToken = "<%";
1297                             nRet = HtmlTokenId::TEXTTOKEN;
1298                             break;
1299                         }
1300                         aToken = aTmpBuffer;
1301                         aTmpBuffer.setLength(0);
1302                         if( IsParserWorking() )
1303                         {
1304                             sSaveToken = aToken;
1305                             aToken.setLength( 0 );
1306                         }
1307                     }
1308                     else
1309                     {
                             // A lone '<' is plain text; nNextCh already
                             // holds the character following it.
1310                         aToken = "<";
1311                         nRet = HtmlTokenId::TEXTTOKEN;
1312                         bNextCh = false;
1313                         break;
1314                     }
1315                 }
1316 
1317                 if( IsParserWorking() )
1318                 {
                         // Read past the tag only if its closing '>' was
                         // actually found.
1319                     bNextCh = '>' == nNextCh;
                         // Update the mode flags that control how the
                         // following input is read.
1320                     switch( nRet )
1321                     {
1322                     case HtmlTokenId::TEXTAREA_ON:
1323                         bReadTextArea = true;
1324                         break;
1325                     case HtmlTokenId::TEXTAREA_OFF:
1326                         bReadTextArea = false;
1327                         break;
1328                     case HtmlTokenId::SCRIPT_ON:
1329                         if( !bReadTextArea )
1330                             bReadScript = true;
1331                         break;
1332                     case HtmlTokenId::SCRIPT_OFF:
1333                         if( !bReadTextArea )
1334                         {
1335                             bReadScript = false;
1336                             // JavaScript might modify the stream,
1337                             // thus the last character has to be read again.
1338                             bReadNextChar = true;
1339                             bNextCh = false;
1340                         }
1341                         break;
1342 
1343                     case HtmlTokenId::STYLE_ON:
1344                         bReadStyle = true;
1345                         break;
1346                     case HtmlTokenId::STYLE_OFF:
1347                         bReadStyle = false;
1348                         break;
1349                     default: break;
1350                     }
1351                 }
1352             }
1353             break;
1354 
1355         case sal_Unicode(EOF):
1356             if( rInput.eof() )
1357             {
                     // Real end of input: accept and report the EOF value
                     // itself as the token id.
1358                 eState = SvParserState::Accepted;
1359                 nRet = HtmlTokenId(nNextCh);
1360             }
1361             else
1362             {
1363                 // Read normal text.
1364                 goto scan_text;
1365             }
1366             break;
1367 
1368         case '\f':
1369             // form feeds are passed upwards separately
1370             nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1371             break;
1372 
1373         case '\n':
1374         case '\r':
1375             if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1376             {
                     // A "\r\n" or "\n\r" pair counts as one line break. If
                     // the following character does not complete such a
                     // pair, it becomes the next character to process.
1377                 sal_Unicode c = GetNextChar();
1378                 if( ( '\n' != nNextCh || '\r' != c ) &&
1379                     ( '\r' != nNextCh || '\n' != c ) )
1380                 {
1381                     bNextCh = false;
1382                     nNextCh = c;
1383                 }
1384                 nRet = HtmlTokenId::NEWPARA;
1385                 break;
1386             }
1387             [[fallthrough]];
1388         case '\t':
1389             if( bReadPRE )
1390             {
1391                 nRet = HtmlTokenId::TABCHAR;
1392                 break;
1393             }
1394             [[fallthrough]];
1395         case ' ':
1396             [[fallthrough]];
1397         default:
1398 
1399 scan_text:
1400             // "normal" text to come
1401             nRet = ScanText();
                 // Read on only if ScanText() left no text in aToken.
1402             bNextCh = 0 == aToken.getLength();
1403 
1404             // the text should be processed
1405             if( !bNextCh && eState == SvParserState::Pending )
1406             {
1407                 eState = SvParserState::Working;
1408                 bReadNextChar = true;
1409             }
1410 
1411             break;
1412         }
1413 
1414         if( bNextCh && SvParserState::Working == eState )
1415         {
1416             nNextCh = GetNextChar();
                 // If reading the look-ahead character made the parser
                 // Pending although a complete non-text token was found,
                 // deliver the token now and read the character again on the
                 // next call.
1417             if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1418             {
1419                 bReadNextChar = true;
1420                 eState = SvParserState::Working;
1421             }
1422         }
1423 
1424     } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1425 
1426     if( SvParserState::Pending == eState )
1427         nRet = HtmlTokenId::INVALID;      // s.th. invalid
1428 
1429     return nRet;
1430 }
1431 
UnescapeToken()1432 void HTMLParser::UnescapeToken()
1433 {
1434     sal_Int32 nPos=0;
1435 
1436     bool bEscape = false;
1437     while( nPos < aToken.getLength() )
1438     {
1439         bool bOldEscape = bEscape;
1440         bEscape = false;
1441         if( '\\'==aToken[nPos] && !bOldEscape )
1442         {
1443             aToken.remove( nPos, 1 );
1444             bEscape = true;
1445         }
1446         else
1447         {
1448             nPos++;
1449         }
1450     }
1451 }
1452 
GetOptions(HtmlOptionId const * pNoConvertToken)1453 const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1454 {
1455     // If the options for the current token have already been returned,
1456     // return them once again.
1457     if (!maOptions.empty())
1458         return maOptions;
1459 
1460     sal_Int32 nPos = 0;
1461     while( nPos < aToken.getLength() )
1462     {
1463         // A letter? Option beginning here.
1464         if( rtl::isAsciiAlpha( aToken[nPos] ) )
1465         {
1466             HtmlOptionId nToken;
1467             OUString aValue;
1468             sal_Int32 nStt = nPos;
1469             sal_Unicode cChar = 0;
1470 
1471             // Actually only certain characters allowed.
1472             // Netscape only looks for "=" and white space (c.f.
1473             // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1474             while( nPos < aToken.getLength() )
1475             {
1476                 cChar = aToken[nPos];
1477                 if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1478                     break;
1479                 nPos++;
1480             }
1481 
1482             OUString sName( aToken.subView( nStt, nPos-nStt ) );
1483 
1484             // PlugIns require original token name. Convert to lower case only for searching.
1485             nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1486             SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
1487                         "GetOption: unknown HTML option '" << sName << "'" );
1488             bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1489                                nToken >= HtmlOptionId::SCRIPT_END) &&
1490                               (!pNoConvertToken || nToken != *pNoConvertToken);
1491 
1492             while( nPos < aToken.getLength() )
1493             {
1494                 cChar = aToken[nPos];
1495                 if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1496                     break;
1497                 nPos++;
1498             }
1499 
1500             // Option with value?
1501             if( nPos!=aToken.getLength() && '='==cChar )
1502             {
1503                 nPos++;
1504 
1505                 while( nPos < aToken.getLength() )
1506                 {
1507                     cChar = aToken[nPos];
1508                     if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1509                         break;
1510                     nPos++;
1511                 }
1512 
1513                 if( nPos != aToken.getLength() )
1514                 {
1515                     sal_Int32 nLen = 0;
1516                     nStt = nPos;
1517                     if( ('"'==cChar) || '\''==cChar )
1518                     {
1519                         sal_Unicode cEnd = cChar;
1520                         nPos++; nStt++;
1521                         bool bDone = false;
1522                         bool bEscape = false;
1523                         while( nPos < aToken.getLength() && !bDone )
1524                         {
1525                             bool bOldEscape = bEscape;
1526                             bEscape = false;
1527                             cChar = aToken[nPos];
1528                             switch( cChar )
1529                             {
1530                             case '\r':
1531                             case '\n':
1532                                 if( bStripCRLF )
1533                                     aToken.remove( nPos, 1 );
1534                                 else
1535                                 {
1536                                     nPos++;
1537                                     nLen++;
1538                                 }
1539                                 break;
1540                             case '\\':
1541                                 if( bOldEscape )
1542                                 {
1543                                     nPos++;
1544                                     nLen++;
1545                                 }
1546                                 else
1547                                 {
1548                                     aToken.remove( nPos, 1 );
1549                                     bEscape = true;
1550                                 }
1551                                 break;
1552                             case '"':
1553                             case '\'':
1554                                 bDone = !bOldEscape && cChar==cEnd;
1555                                 if( !bDone )
1556                                 {
1557                                     nPos++;
1558                                     nLen++;
1559                                 }
1560                                 break;
1561                             default:
1562                                 nPos++;
1563                                 nLen++;
1564                                 break;
1565                             }
1566                         }
1567                         if( nPos!=aToken.getLength() )
1568                             nPos++;
1569                     }
1570                     else
1571                     {
1572                         // More liberal than the standard: allow all printable characters
1573                         bool bEscape = false;
1574                         bool bDone = false;
1575                         while( nPos < aToken.getLength() && !bDone )
1576                         {
1577                             bool bOldEscape = bEscape;
1578                             bEscape = false;
1579                             sal_Unicode c = aToken[nPos];
1580                             switch( c )
1581                             {
1582                             case ' ':
1583                                 bDone = !bOldEscape;
1584                                 if( !bDone )
1585                                 {
1586                                     nPos++;
1587                                     nLen++;
1588                                 }
1589                                 break;
1590 
1591                             case '\t':
1592                             case '\r':
1593                             case '\n':
1594                                 bDone = true;
1595                                 break;
1596 
1597                             case '\\':
1598                                 if( bOldEscape )
1599                                 {
1600                                     nPos++;
1601                                     nLen++;
1602                                 }
1603                                 else
1604                                 {
1605                                     aToken.remove( nPos, 1 );
1606                                     bEscape = true;
1607                                 }
1608                                 break;
1609 
1610                             default:
1611                                 if( HTML_ISPRINTABLE( c ) )
1612                                 {
1613                                     nPos++;
1614                                     nLen++;
1615                                 }
1616                                 else
1617                                     bDone = true;
1618                                 break;
1619                             }
1620                         }
1621                     }
1622 
1623                     if( nLen )
1624                         aValue = aToken.subView( nStt, nLen );
1625                 }
1626             }
1627 
1628             // Token is known and can be saved
1629             maOptions.emplace_back(nToken, sName, aValue);
1630 
1631         }
1632         else
1633             // Ignore white space and unexpected characters
1634             nPos++;
1635     }
1636 
1637     return maOptions;
1638 }
1639 
FilterPRE(HtmlTokenId nToken)1640 HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
1641 {
1642     switch( nToken )
1643     {
1644     // in Netscape they only have impact in not empty paragraphs
1645     case HtmlTokenId::PARABREAK_ON:
1646         nToken = HtmlTokenId::LINEBREAK;
1647         [[fallthrough]];
1648     case HtmlTokenId::LINEBREAK:
1649     case HtmlTokenId::NEWPARA:
1650         nPre_LinePos = 0;
1651         if( bPre_IgnoreNewPara )
1652             nToken = HtmlTokenId::NONE;
1653         break;
1654 
1655     case HtmlTokenId::TABCHAR:
1656         {
1657             sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1658             DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1659             if (aToken.getLength() < nSpaces)
1660             {
1661                 using comphelper::string::padToLength;
1662                 OUStringBuffer aBuf(aToken);
1663                 aToken = padToLength(aBuf, nSpaces, ' ');
1664             }
1665             nPre_LinePos += nSpaces;
1666             nToken = HtmlTokenId::TEXTTOKEN;
1667         }
1668         break;
1669     // Keep those
1670     case HtmlTokenId::TEXTTOKEN:
1671         nPre_LinePos += aToken.getLength();
1672         break;
1673 
1674     case HtmlTokenId::SELECT_ON:
1675     case HtmlTokenId::SELECT_OFF:
1676     case HtmlTokenId::BODY_ON:
1677     case HtmlTokenId::FORM_ON:
1678     case HtmlTokenId::FORM_OFF:
1679     case HtmlTokenId::INPUT:
1680     case HtmlTokenId::OPTION:
1681     case HtmlTokenId::TEXTAREA_ON:
1682     case HtmlTokenId::TEXTAREA_OFF:
1683 
1684     case HtmlTokenId::IMAGE:
1685     case HtmlTokenId::APPLET_ON:
1686     case HtmlTokenId::APPLET_OFF:
1687     case HtmlTokenId::PARAM:
1688     case HtmlTokenId::EMBED:
1689 
1690     case HtmlTokenId::HEAD1_ON:
1691     case HtmlTokenId::HEAD1_OFF:
1692     case HtmlTokenId::HEAD2_ON:
1693     case HtmlTokenId::HEAD2_OFF:
1694     case HtmlTokenId::HEAD3_ON:
1695     case HtmlTokenId::HEAD3_OFF:
1696     case HtmlTokenId::HEAD4_ON:
1697     case HtmlTokenId::HEAD4_OFF:
1698     case HtmlTokenId::HEAD5_ON:
1699     case HtmlTokenId::HEAD5_OFF:
1700     case HtmlTokenId::HEAD6_ON:
1701     case HtmlTokenId::HEAD6_OFF:
1702     case HtmlTokenId::BLOCKQUOTE_ON:
1703     case HtmlTokenId::BLOCKQUOTE_OFF:
1704     case HtmlTokenId::ADDRESS_ON:
1705     case HtmlTokenId::ADDRESS_OFF:
1706     case HtmlTokenId::HORZRULE:
1707 
1708     case HtmlTokenId::CENTER_ON:
1709     case HtmlTokenId::CENTER_OFF:
1710     case HtmlTokenId::DIVISION_ON:
1711     case HtmlTokenId::DIVISION_OFF:
1712 
1713     case HtmlTokenId::SCRIPT_ON:
1714     case HtmlTokenId::SCRIPT_OFF:
1715     case HtmlTokenId::RAWDATA:
1716 
1717     case HtmlTokenId::TABLE_ON:
1718     case HtmlTokenId::TABLE_OFF:
1719     case HtmlTokenId::CAPTION_ON:
1720     case HtmlTokenId::CAPTION_OFF:
1721     case HtmlTokenId::COLGROUP_ON:
1722     case HtmlTokenId::COLGROUP_OFF:
1723     case HtmlTokenId::COL_ON:
1724     case HtmlTokenId::COL_OFF:
1725     case HtmlTokenId::THEAD_ON:
1726     case HtmlTokenId::THEAD_OFF:
1727     case HtmlTokenId::TFOOT_ON:
1728     case HtmlTokenId::TFOOT_OFF:
1729     case HtmlTokenId::TBODY_ON:
1730     case HtmlTokenId::TBODY_OFF:
1731     case HtmlTokenId::TABLEROW_ON:
1732     case HtmlTokenId::TABLEROW_OFF:
1733     case HtmlTokenId::TABLEDATA_ON:
1734     case HtmlTokenId::TABLEDATA_OFF:
1735     case HtmlTokenId::TABLEHEADER_ON:
1736     case HtmlTokenId::TABLEHEADER_OFF:
1737 
1738     case HtmlTokenId::ANCHOR_ON:
1739     case HtmlTokenId::ANCHOR_OFF:
1740     case HtmlTokenId::BOLD_ON:
1741     case HtmlTokenId::BOLD_OFF:
1742     case HtmlTokenId::ITALIC_ON:
1743     case HtmlTokenId::ITALIC_OFF:
1744     case HtmlTokenId::STRIKE_ON:
1745     case HtmlTokenId::STRIKE_OFF:
1746     case HtmlTokenId::STRIKETHROUGH_ON:
1747     case HtmlTokenId::STRIKETHROUGH_OFF:
1748     case HtmlTokenId::UNDERLINE_ON:
1749     case HtmlTokenId::UNDERLINE_OFF:
1750     case HtmlTokenId::BASEFONT_ON:
1751     case HtmlTokenId::BASEFONT_OFF:
1752     case HtmlTokenId::FONT_ON:
1753     case HtmlTokenId::FONT_OFF:
1754     case HtmlTokenId::BLINK_ON:
1755     case HtmlTokenId::BLINK_OFF:
1756     case HtmlTokenId::SPAN_ON:
1757     case HtmlTokenId::SPAN_OFF:
1758     case HtmlTokenId::SUBSCRIPT_ON:
1759     case HtmlTokenId::SUBSCRIPT_OFF:
1760     case HtmlTokenId::SUPERSCRIPT_ON:
1761     case HtmlTokenId::SUPERSCRIPT_OFF:
1762     case HtmlTokenId::BIGPRINT_ON:
1763     case HtmlTokenId::BIGPRINT_OFF:
1764     case HtmlTokenId::SMALLPRINT_OFF:
1765     case HtmlTokenId::SMALLPRINT_ON:
1766 
1767     case HtmlTokenId::EMPHASIS_ON:
1768     case HtmlTokenId::EMPHASIS_OFF:
1769     case HtmlTokenId::CITATION_ON:
1770     case HtmlTokenId::CITATION_OFF:
1771     case HtmlTokenId::STRONG_ON:
1772     case HtmlTokenId::STRONG_OFF:
1773     case HtmlTokenId::CODE_ON:
1774     case HtmlTokenId::CODE_OFF:
1775     case HtmlTokenId::SAMPLE_ON:
1776     case HtmlTokenId::SAMPLE_OFF:
1777     case HtmlTokenId::KEYBOARD_ON:
1778     case HtmlTokenId::KEYBOARD_OFF:
1779     case HtmlTokenId::VARIABLE_ON:
1780     case HtmlTokenId::VARIABLE_OFF:
1781     case HtmlTokenId::DEFINSTANCE_ON:
1782     case HtmlTokenId::DEFINSTANCE_OFF:
1783     case HtmlTokenId::SHORTQUOTE_ON:
1784     case HtmlTokenId::SHORTQUOTE_OFF:
1785     case HtmlTokenId::LANGUAGE_ON:
1786     case HtmlTokenId::LANGUAGE_OFF:
1787     case HtmlTokenId::AUTHOR_ON:
1788     case HtmlTokenId::AUTHOR_OFF:
1789     case HtmlTokenId::PERSON_ON:
1790     case HtmlTokenId::PERSON_OFF:
1791     case HtmlTokenId::ACRONYM_ON:
1792     case HtmlTokenId::ACRONYM_OFF:
1793     case HtmlTokenId::ABBREVIATION_ON:
1794     case HtmlTokenId::ABBREVIATION_OFF:
1795     case HtmlTokenId::INSERTEDTEXT_ON:
1796     case HtmlTokenId::INSERTEDTEXT_OFF:
1797     case HtmlTokenId::DELETEDTEXT_ON:
1798     case HtmlTokenId::DELETEDTEXT_OFF:
1799     case HtmlTokenId::TELETYPE_ON:
1800     case HtmlTokenId::TELETYPE_OFF:
1801 
1802         break;
1803 
1804     // The remainder is treated as an unknown token.
1805     default:
1806         if( nToken != HtmlTokenId::NONE )
1807         {
1808             nToken =
1809                 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1810                     ? HtmlTokenId::UNKNOWNCONTROL_OFF
1811                     : HtmlTokenId::UNKNOWNCONTROL_ON );
1812         }
1813         break;
1814     }
1815 
1816     bPre_IgnoreNewPara = false;
1817 
1818     return nToken;
1819 }
1820 
FilterXMP(HtmlTokenId nToken)1821 HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
1822 {
1823     switch( nToken )
1824     {
1825     case HtmlTokenId::NEWPARA:
1826         if( bPre_IgnoreNewPara )
1827             nToken = HtmlTokenId::NONE;
1828         [[fallthrough]];
1829     case HtmlTokenId::TEXTTOKEN:
1830     case HtmlTokenId::NONBREAKSPACE:
1831     case HtmlTokenId::SOFTHYPH:
1832         break;              // kept
1833 
1834     default:
1835         if( nToken != HtmlTokenId::NONE )
1836         {
1837             if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
1838             {
1839                 sSaveToken = "</" + sSaveToken;
1840             }
1841             else
1842                 sSaveToken = "<" + sSaveToken;
1843             if( !aToken.isEmpty() )
1844             {
1845                 UnescapeToken();
1846                 sSaveToken += " ";
1847                 aToken.insert(0, sSaveToken);
1848             }
1849             else
1850                 aToken = sSaveToken;
1851             aToken.append( ">" );
1852             nToken = HtmlTokenId::TEXTTOKEN;
1853         }
1854         break;
1855     }
1856 
1857     bPre_IgnoreNewPara = false;
1858 
1859     return nToken;
1860 }
1861 
FilterListing(HtmlTokenId nToken)1862 HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
1863 {
1864     switch( nToken )
1865     {
1866     case HtmlTokenId::NEWPARA:
1867         if( bPre_IgnoreNewPara )
1868             nToken = HtmlTokenId::NONE;
1869         [[fallthrough]];
1870     case HtmlTokenId::TEXTTOKEN:
1871     case HtmlTokenId::NONBREAKSPACE:
1872     case HtmlTokenId::SOFTHYPH:
1873         break;      // kept
1874 
1875     default:
1876         if( nToken != HtmlTokenId::NONE )
1877         {
1878             nToken =
1879                 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1880                     ? HtmlTokenId::UNKNOWNCONTROL_OFF
1881                     : HtmlTokenId::UNKNOWNCONTROL_ON );
1882         }
1883         break;
1884     }
1885 
1886     bPre_IgnoreNewPara = false;
1887 
1888     return nToken;
1889 }
1890 
InternalImgToPrivateURL(OUString & rURL)1891 bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1892 {
1893     bool bFound = false;
1894 
1895     if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1896     {
1897         OUString aName( rURL.copy(14) );
1898         switch( aName[0] )
1899         {
1900         case 'b':
1901             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1902             break;
1903         case 'd':
1904             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1905             break;
1906         case 'e':
1907             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1908             break;
1909         case 'i':
1910             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1911             break;
1912         case 'n':
1913             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1914             break;
1915         }
1916     }
1917     if( bFound )
1918     {
1919         OUString sTmp ( rURL );
1920         rURL =  OOO_STRING_SVTOOLS_HTML_private_image;
1921         rURL += sTmp;
1922     }
1923 
1924     return bFound;
1925 }
1926 
namespace
{

/// Kinds of META tags, selected through the tag's NAME / HTTP-EQUIV value.
enum class HtmlMeta
{
    NONE = 0,       ///< initial value, also used by the name table terminator
    Author,
    Description,
    Keywords,
    Refresh,
    Classification,
    Created,
    ChangedBy,
    Changed,
    Generator,
    SDFootnote,
    SDEndnote,
    ContentType
};

} // anonymous namespace
1946 
1947 // <META NAME=xxx>
1948 HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
1949 {
1950     { OOO_STRING_SVTOOLS_HTML_META_author,        HtmlMeta::Author        },
1951     { OOO_STRING_SVTOOLS_HTML_META_changed,       HtmlMeta::Changed       },
1952     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HtmlMeta::ChangedBy     },
1953     { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1954     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HtmlMeta::ContentType   },
1955     { OOO_STRING_SVTOOLS_HTML_META_created,       HtmlMeta::Created       },
1956     { OOO_STRING_SVTOOLS_HTML_META_description,   HtmlMeta::Description   },
1957     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HtmlMeta::Keywords      },
1958     { OOO_STRING_SVTOOLS_HTML_META_generator,     HtmlMeta::Generator     },
1959     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HtmlMeta::Refresh       },
1960     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HtmlMeta::SDEndnote     },
1961     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HtmlMeta::SDFootnote    },
1962     { nullptr,                                    HtmlMeta(0)             }
1963 };
1964 
1965 
AddMetaUserDefined(OUString const &)1966 void HTMLParser::AddMetaUserDefined( OUString const & )
1967 {
1968 }
1969 
ParseMetaOptionsImpl(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHTTPHeader,const HTMLOptions & aOptions,rtl_TextEncoding & o_rEnc)1970 bool HTMLParser::ParseMetaOptionsImpl(
1971         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1972         SvKeyValueIterator *i_pHTTPHeader,
1973         const HTMLOptions& aOptions,
1974         rtl_TextEncoding& o_rEnc )
1975 {
1976     OUString aName, aContent;
1977     HtmlMeta nAction = HtmlMeta::NONE;
1978     bool bHTTPEquiv = false, bChanged = false;
1979 
1980     for ( size_t i = aOptions.size(); i; )
1981     {
1982         const HTMLOption& aOption = aOptions[--i];
1983         switch ( aOption.GetToken() )
1984         {
1985             case HtmlOptionId::NAME:
1986                 aName = aOption.GetString();
1987                 if ( HtmlMeta::NONE==nAction )
1988                 {
1989                     aOption.GetEnum( nAction, aHTMLMetaNameTable );
1990                 }
1991                 break;
1992             case HtmlOptionId::HTTPEQUIV:
1993                 aName = aOption.GetString();
1994                 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1995                 bHTTPEquiv = true;
1996                 break;
1997             case HtmlOptionId::CONTENT:
1998                 aContent = aOption.GetString();
1999                 break;
2000             case HtmlOptionId::CHARSET:
2001             {
2002                 OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
2003                 o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
2004                 break;
2005             }
2006             default: break;
2007         }
2008     }
2009 
2010     if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2011     {
2012         // if it is not a Description, remove CRs and LFs from CONTENT
2013         aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2014     }
2015     else
2016     {
2017         // convert line endings for Description
2018         aContent = convertLineEnd(aContent, GetSystemLineEnd());
2019     }
2020 
2021     if ( bHTTPEquiv && i_pHTTPHeader )
2022     {
2023         // Netscape seems to just ignore a closing ", so we do too
2024         if ( aContent.endsWith("\"") )
2025         {
2026             aContent = aContent.copy( 0, aContent.getLength() - 1 );
2027         }
2028         SvKeyValue aKeyValue( aName, aContent );
2029         i_pHTTPHeader->Append( aKeyValue );
2030     }
2031 
2032     switch ( nAction )
2033     {
2034         case HtmlMeta::Author:
2035             if (i_xDocProps.is()) {
2036                 i_xDocProps->setAuthor( aContent );
2037                 bChanged = true;
2038             }
2039             break;
2040         case HtmlMeta::Description:
2041             if (i_xDocProps.is()) {
2042                 i_xDocProps->setDescription( aContent );
2043                 bChanged = true;
2044             }
2045             break;
2046         case HtmlMeta::Keywords:
2047             if (i_xDocProps.is()) {
2048                 i_xDocProps->setKeywords(
2049                     ::comphelper::string::convertCommaSeparated(aContent));
2050                 bChanged = true;
2051             }
2052             break;
2053         case HtmlMeta::Classification:
2054             if (i_xDocProps.is()) {
2055                 i_xDocProps->setSubject( aContent );
2056                 bChanged = true;
2057             }
2058             break;
2059 
2060         case HtmlMeta::ChangedBy:
2061             if (i_xDocProps.is()) {
2062                 i_xDocProps->setModifiedBy( aContent );
2063                 bChanged = true;
2064             }
2065             break;
2066 
2067         case HtmlMeta::Created:
2068         case HtmlMeta::Changed:
2069             if (i_xDocProps.is() && !aContent.isEmpty())
2070             {
2071                 ::util::DateTime uDT;
2072                 bool valid = false;
2073                 if (comphelper::string::getTokenCount(aContent, ';') == 2)
2074                 {
2075                     sal_Int32 nIdx{ 0 };
2076                     sal_Int32 nDate = o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx));
2077                     sal_Int64 nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
2078                     valid = nDate != std::numeric_limits<sal_Int32>::min() &&
2079                             nTime != std::numeric_limits<sal_Int64>::min();
2080                     if (valid)
2081                     {
2082                         Date aDate(nDate);
2083                         tools::Time aTime(nTime);
2084                         uDT = DateTime(aDate, aTime).GetUNODateTime();
2085                     }
2086                 }
2087                 else if (utl::ISO8601parseDateTime(aContent, uDT))
2088                     valid = true;
2089 
2090                 if (valid)
2091                 {
2092                     bChanged = true;
2093                     if (HtmlMeta::Created == nAction)
2094                         i_xDocProps->setCreationDate(uDT);
2095                     else
2096                         i_xDocProps->setModificationDate(uDT);
2097                 }
2098             }
2099             break;
2100 
2101         case HtmlMeta::Refresh:
2102             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2103             break;
2104 
2105         case HtmlMeta::ContentType:
2106             if ( !aContent.isEmpty() )
2107             {
2108                 o_rEnc = GetEncodingByMIME( aContent );
2109             }
2110             break;
2111 
2112         case HtmlMeta::NONE:
2113             if ( !bHTTPEquiv )
2114             {
2115                 if (i_xDocProps.is())
2116                 {
2117                     uno::Reference<beans::XPropertyContainer> xUDProps
2118                         = i_xDocProps->getUserDefinedProperties();
2119                     try {
2120                         xUDProps->addProperty(aName,
2121                             beans::PropertyAttribute::REMOVABLE,
2122                             uno::Any(aContent));
2123                         AddMetaUserDefined(aName);
2124                         bChanged = true;
2125                     } catch (uno::Exception &) {
2126                         // ignore
2127                     }
2128                 }
2129             }
2130             break;
2131         default:
2132             break;
2133     }
2134 
2135     return bChanged;
2136 }
2137 
ParseMetaOptions(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHeader)2138 bool HTMLParser::ParseMetaOptions(
2139         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2140         SvKeyValueIterator *i_pHeader )
2141 {
2142     HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2143     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2144 
2145     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2146                       GetOptions(&nContentOption),
2147                       eEnc );
2148 
2149     // If the encoding is set by a META tag, it may only overwrite the
2150     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2151     // encodings. Everything else cannot lead to reasonable results.
2152     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2153         rtl_isOctetTextEncoding( eEnc ) &&
2154         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2155     {
2156         eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2157         SetSrcEncoding( eEnc );
2158     }
2159 
2160     return bRet;
2161 }
2162 
GetEncodingByMIME(const OUString & rMime)2163 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2164 {
2165     OUString sType;
2166     OUString sSubType;
2167     INetContentTypeParameterList aParameters;
2168     if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2169     {
2170         auto const iter = aParameters.find("charset"_ostr);
2171         if (iter != aParameters.end())
2172         {
2173             const INetContentTypeParameter * pCharset = &iter->second;
2174             OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2175             return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2176         }
2177     }
2178     return RTL_TEXTENCODING_DONTKNOW;
2179 }
2180 
GetEncodingByHttpHeader(SvKeyValueIterator * pHTTPHeader)2181 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2182 {
2183     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2184     if( pHTTPHeader )
2185     {
2186         SvKeyValue aKV;
2187         for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2188              bCont = pHTTPHeader->GetNext( aKV ) )
2189         {
2190             if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2191             {
2192                 if( !aKV.GetValue().isEmpty() )
2193                 {
2194                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2195                 }
2196             }
2197         }
2198     }
2199     return eRet;
2200 }
2201 
SetEncodingByHTTPHeader(SvKeyValueIterator * pHTTPHeader)2202 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2203 {
2204     bool bRet = false;
2205     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2206     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2207     {
2208         SetSrcEncoding( eEnc );
2209         bRet = true;
2210     }
2211     return bRet;
2212 }
2213 
2214 
2215 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
2216