1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  */
9 
10 #include "filterdetect.hxx"
11 
12 #include <svtools/htmltokn.h>
13 #include <tools/urlobj.hxx>
14 #include <tools/zcodec.hxx>
15 #include <ucbhelper/content.hxx>
16 #include <unotools/mediadescriptor.hxx>
17 #include <unotools/streamwrap.hxx>
18 #include <unotools/ucbstreamhelper.hxx>
19 
20 #include <com/sun/star/io/XInputStream.hpp>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <memory>
23 
24 constexpr OUString WRITER_TEXT_FILTER = u"Text"_ustr;
25 constexpr OUString CALC_TEXT_FILTER = u"Text - txt - csv (StarCalc)"_ustr;
26 
27 constexpr OUStringLiteral WEB_HTML_FILTER = u"HTML";
28 constexpr OUStringLiteral WRITER_HTML_FILTER = u"HTML (StarWriter)";
29 constexpr OUStringLiteral CALC_HTML_FILTER = u"calc_HTML_WebQuery";
30 
31 constexpr OUString WRITER_DOCSERVICE = u"com.sun.star.text.TextDocument"_ustr;
32 constexpr OUString CALC_DOCSERVICE = u"com.sun.star.sheet.SpreadsheetDocument"_ustr;
33 
34 using namespace ::com::sun::star;
35 using utl::MediaDescriptor;
36 
37 namespace {
38 
IsHTMLStream(const uno::Reference<io::XInputStream> & xInStream)39 bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
40 {
41     std::unique_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
42     if ( !pInStream || pInStream->GetError() )
43         // No stream
44         return false;
45 
46     // Read the stream header
47     pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
48     const sal_uInt64 nUniPos = pInStream->Tell();
49     const sal_uInt16 nSize = 4096;
50 
51     OString sHeader;
52     if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
53         sHeader = read_uInt8s_ToOString( *pInStream, nSize );
54     else // UTF-16 (nUniPos = 2)
55         sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
56 
57     // Now check whether the stream begins with a known HTML tag.
58     enum DetectPhase { BeforeTag, TagOpened, InTagName };
59     DetectPhase dp = BeforeTag;
60     /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration.
61     enum DeclarationPhase
62     {
63         BeforeDeclaration,
64         DeclarationOpened
65     };
66     DeclarationPhase eDeclaration = BeforeDeclaration;
67 
68     const char* pHeader = sHeader.getStr();
69     const int   nLength = sHeader.getLength();
70     int i = 0, nStartOfTagIndex = 0;
71 
72     for ( i = 0; i < nLength; ++i, ++pHeader )
73     {
74         char c = *pHeader;
75         if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
76             && eDeclaration == BeforeDeclaration)
77         {
78             if ( dp == TagOpened )
79                 return false; // Invalid: Should start with a tag name
80             else if ( dp == InTagName )
81                 break; // End of tag name reached
82         }
83         else if ( c == '<' )
84         {
85             if ( dp == BeforeTag )
86                 dp = TagOpened;
87             else
88                 return false; // Invalid: Nested '<'
89         }
90         else if ( c == '>' )
91         {
92             if ( dp == InTagName )
93                 break; // End of tag name reached
94             else if (eDeclaration == DeclarationOpened)
95             {
96                 dp = BeforeTag;
97                 eDeclaration = BeforeDeclaration;
98             }
99             else
100                 return false; // Invalid: Empty tag or before '<'
101         }
102         else if ( c == '!' )
103         {
104             if ( dp == TagOpened )
105                 return true; // "<!" - DOCTYPE or comments block
106             else
107                 return false; // Invalid: '!' before '<' or inside tag name
108         }
109         else
110         {
111             if ( dp == BeforeTag )
112                 return false; // Invalid: Should start with a tag
113             else if ( dp == TagOpened )
114             {
115                 if (c == '?' && eDeclaration == BeforeDeclaration)
116                     eDeclaration = DeclarationOpened;
117                 else if (eDeclaration == BeforeDeclaration)
118                 {
119                     nStartOfTagIndex = i;
120                     dp = InTagName;
121                 }
122             }
123         }
124     }
125 
126     // The string following '<' has to be a known HTML token.
127     OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
128     return GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != HtmlTokenId::NONE;
129 }
130 }
131 
PlainTextFilterDetect()132 PlainTextFilterDetect::PlainTextFilterDetect() {}
133 
~PlainTextFilterDetect()134 PlainTextFilterDetect::~PlainTextFilterDetect() {}
135 
detect(uno::Sequence<beans::PropertyValue> & lDescriptor)136 OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor)
137 {
138     MediaDescriptor aMediaDesc(lDescriptor);
139 
140     OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME, OUString() );
141     OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE, OUString() );
142 
143     if ((aType == "generic_HTML") || (aType == "calc_HTML"))
144     {
145         uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
146         if (!xInStream.is() || !IsHTMLStream(xInStream))
147             return OUString();
148 
149         if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
150             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_HTML_FILTER);
151         else if (aDocService == WRITER_DOCSERVICE)
152             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_HTML_FILTER);
153         else
154             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WEB_HTML_FILTER);
155     }
156 
157     else if (aType == "generic_Text")
158     {
159         uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM], uno::UNO_QUERY);
160         uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
161         if (xStream.is() || xInStream.is())
162         {
163             ZCodec aCodecGZ;
164             std::unique_ptr<SvStream> pInStream;
165             if (xStream.is())
166                 pInStream = utl::UcbStreamHelper::CreateStream(xStream);
167             else
168                 pInStream = utl::UcbStreamHelper::CreateStream(xInStream);
169             std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
170             if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream))
171             {
172                 uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream)));
173                 aMediaDesc[MediaDescriptor::PROP_STREAM] <<= xStreamDecompressed;
174                 aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM] <<= xStreamDecompressed->getInputStream();
175                 OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() );
176                 sal_Int32 nIdx = aURL.lastIndexOf(".gz");
177                 if (nIdx != -1)
178                     aMediaDesc[MediaDescriptor::PROP_URL] <<= aURL.copy(0, nIdx);
179             }
180         }
181         // Get the file name extension.
182         INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ) );
183         OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DecodeMechanism::WithCharset);
184         aExt = aExt.toAsciiLowerCase();
185         OUString aName = aParser.getName().toAsciiLowerCase();
186 
187         // Decide which filter to use based on the document service first,
188         // then on extension if that's not available.
189 
190         if (aDocService == CALC_DOCSERVICE)
191             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= CALC_TEXT_FILTER;
192         else if (aDocService == WRITER_DOCSERVICE)
193             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= WRITER_TEXT_FILTER;
194         else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
195             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= CALC_TEXT_FILTER;
196         else
197             aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= WRITER_TEXT_FILTER;
198     }
199 
200     else
201         // Nothing to detect.
202         return OUString();
203 
204     aMediaDesc >> lDescriptor;
205     return aType;
206 }
207 
208 // XInitialization
209 
initialize(const uno::Sequence<uno::Any> &)210 void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
211 {
212 }
213 
PlainTextFilterDetect_getImplementationName()214 OUString PlainTextFilterDetect_getImplementationName()
215 {
216     return u"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr;
217 }
218 
PlainTextFilterDetect_getSupportedServiceNames()219 uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
220 {
221     return { u"com.sun.star.document.ExtendedTypeDetection"_ustr, u"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr };
222 }
223 
224 // XServiceInfo
getImplementationName()225 OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
226 {
227     return PlainTextFilterDetect_getImplementationName();
228 }
229 
supportsService(const OUString & rServiceName)230 sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
231 {
232     return cppu::supportsService(this, rServiceName);
233 }
234 
getSupportedServiceNames()235 uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
236 {
237     return PlainTextFilterDetect_getSupportedServiceNames();
238 }
239 
240 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext *,css::uno::Sequence<css::uno::Any> const &)241 com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext* ,
242                                                                    css::uno::Sequence<css::uno::Any> const &)
243 {
244     return cppu::acquire(new PlainTextFilterDetect);
245 }
246 
247 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
248