xref: /core/sc/source/ui/unoobj/scdetect.cxx (revision 592947c1a421eae46d6268798cf3eab6bc6f0dc6)
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "scdetect.hxx"
21 
22 #include <sal/macros.h>
23 
24 #include <com/sun/star/beans/PropertyValue.hpp>
25 #include <com/sun/star/uno/XComponentContext.hpp>
26 #include <cppuhelper/supportsservice.hxx>
27 #include <com/sun/star/io/XInputStream.hpp>
28 #include <unotools/mediadescriptor.hxx>
29 #include <sfx2/docfile.hxx>
30 #include <sfx2/docfilt.hxx>
31 #include <sfx2/fcontnr.hxx>
32 
33 using namespace ::com::sun::star;
34 using utl::MediaDescriptor;
35 
36 namespace {
37 
38 // table with search pattern
39 // meaning of the sequences
40 // 0x00??: the exact byte 0x?? must be at that place
41 // 0x0100: read over a byte (don't care)
42 // 0x02nn: a byte of 0xnn variations follows
43 // 0x8000: recognition finished
44 
45 #define M_DC        0x0100
46 #define M_ALT(CNT)  (0x0200+(CNT))
47 #define M_END       0x8000
48 
49 const sal_uInt16 pLotus[] =      // Lotus 1/1A/2
50     { 0x0000, 0x0000, 0x0002, 0x0000,
51     M_ALT(2), 0x0004, 0x0006,
52     0x0004, M_END };
53 
54 const sal_uInt16 pLotusNew[] =   // Lotus >= 9.7
55     { 0x0000, 0x0000, M_DC, 0x0000,     // Rec# + Len (0x1a)
56       M_ALT(3), 0x0003, 0x0004, 0x0005, // File Revision Code 97->ME
57       0x0010, 0x0004, 0x0000, 0x0000,
58       M_END };
59 
60 const sal_uInt16 pLotus2[] =     // Lotus >3
61     { 0x0000, 0x0000, 0x001A, 0x0000,   // Rec# + Len (26)
62     M_ALT(2), 0x0000, 0x0002,         // File Revision Code
63     0x0010,
64     0x0004, 0x0000,                   // File Revision Subcode
65     M_END };
66 
67 const sal_uInt16 pQPro[] =
68        { 0x0000, 0x0000, 0x0002, 0x0000,
69          M_ALT(4), 0x0001, 0x0002, // WB1, WB2
70          0x0006, 0x0007,           // QPro 6/7 (?)
71          0x0010,
72          M_END };
73 
74 const sal_uInt16 pDIF1[] =       // DIF with CR-LF
75     {
76     'T', 'A', 'B', 'L', 'E',
77     M_DC, M_DC,
78     '0', ',', '1',
79     M_DC, M_DC,
80     '\"',
81     M_END };
82 
83 const sal_uInt16 pDIF2[] =       // DIF with CR or LF
84     {
85     'T', 'A', 'B', 'L', 'E',
86     M_DC,
87     '0', ',', '1',
88     M_DC,
89     '\"',
90     M_END };
91 
92 const sal_uInt16 pSylk[] =       // Sylk
93     {
94     'I', 'D', ';',
95     M_ALT(3), 'P', 'N', 'E',        // 'P' plus undocumented Excel extensions 'N' and 'E'
96     M_END };
97 
detectThisFormat(SvStream & rStr,const sal_uInt16 * pSearch)98 bool detectThisFormat(SvStream& rStr, const sal_uInt16* pSearch)
99 {
100     sal_uInt8 nByte;
101     rStr.Seek( 0 ); // in the beginning everything was bad...
102     rStr.ReadUChar( nByte );
103     bool bSync = true;
104     while( !rStr.eof() && bSync )
105     {
106         sal_uInt16 nMuster = *pSearch;
107 
108         if( nMuster < 0x0100 )
109         { // compare bytes
110             if( static_cast<sal_uInt8>(nMuster) != nByte )
111                 bSync = false;
112         }
113         else if( nMuster & M_DC )
114         { // don't care
115         }
116         else if( nMuster & M_ALT(0) )
117         { // alternative Bytes
118             sal_uInt8 nCntAlt = static_cast<sal_uInt8>(nMuster);
119             bSync = false;          // first unsynchron
120             while( nCntAlt > 0 )
121             {
122                 pSearch++;
123                 if( static_cast<sal_uInt8>(*pSearch) == nByte )
124                     bSync = true;   // only now synchronization
125                 nCntAlt--;
126             }
127         }
128         else if( nMuster & M_END )
129         { // Format detected
130             return true;
131         }
132 
133         pSearch++;
134         rStr.ReadUChar( nByte );
135     }
136 
137     return false;
138 }
139 
140 }
141 
ScFilterDetect()142 ScFilterDetect::ScFilterDetect()
143 {
144 }
145 
~ScFilterDetect()146 ScFilterDetect::~ScFilterDetect()
147 {
148 }
149 
#if 0
// This method is no longer used, but I do want to keep this for now to see
// if we could transfer this check to the now centralized ascii detection
// code in the filter module.
//
// NOTE: everything up to the matching #endif is excluded from compilation.
//
// Heuristic over the first nBufSize*2 bytes of the stream: returns sal_True
// if the data may be ASCII/CSV text, sal_False otherwise.
static sal_Bool lcl_MayBeAscii( SvStream& rStream )
{
    // ASCII/CSV is considered possible if there are no null bytes, or a Byte
    // Order Mark is present, or if, for Unicode UCS2/UTF-16, all null bytes
    // are on either even or uneven byte positions.

    rStream.Seek(STREAM_SEEK_TO_BEGIN);

    // The buffer is filled bytewise but examined as 16-bit units below.
    const size_t nBufSize = 2048;
    sal_uInt16 aBuffer[ nBufSize ];
    sal_uInt8* pByte = reinterpret_cast<sal_uInt8*>(aBuffer);
    sal_uLong nBytesRead = rStream.Read( pByte, nBufSize*2);

    // Both values are accepted, so the BOM is recognised in either byte order.
    if ( nBytesRead >= 2 && (aBuffer[0] == 0xfffe || aBuffer[0] == 0xfeff) )
    {
        // Unicode BOM file may contain null bytes.
        return sal_True;
    }

    // nMask tracks in which half of a 16-bit unit a null byte has been seen:
    // a half is cleared from the mask once a unit has a zero in that half.
    // Seeing zeros in both halves (over any units) drives the mask to 0.
    const sal_uInt16* p = aBuffer;
    sal_uInt16 nMask = 0xffff;
    nBytesRead /= 2;    // from here on: number of complete 16-bit units
    while( nBytesRead-- && nMask )
    {
        sal_uInt16 nVal = *p++ & nMask;
        if (!(nVal & 0x00ff))
            nMask &= 0xff00;
        if (!(nVal & 0xff00))
            nMask &= 0x00ff;
    }

    // Non-zero mask: null bytes, if any, were confined to one half only.
    return nMask != 0;
}
#endif
188 
lcl_MayBeDBase(SvStream & rStream)189 static bool lcl_MayBeDBase( SvStream& rStream )
190 {
191     // Look for dbf marker, see connectivity/source/inc/dbase/DTable.hxx
192     // DBFType for values.
193     const sal_uInt8 nValidMarks[] = {
194         0x03, 0x04, 0x05, 0x30, 0x31, 0x43, 0xB3, 0x83, 0x8b, 0x8e, 0xf5 };
195     sal_uInt8 nMark;
196     rStream.Seek(STREAM_SEEK_TO_BEGIN);
197     rStream.ReadUChar( nMark );
198     bool bValidMark = false;
199     for (size_t i=0; i < SAL_N_ELEMENTS(nValidMarks) && !bValidMark; ++i)
200     {
201         if (nValidMarks[i] == nMark)
202             bValidMark = true;
203     }
204     if ( !bValidMark )
205         return false;
206 
207     const size_t nHeaderBlockSize = 32;
208     // Empty dbf is >= 32*2+1 bytes in size.
209     const size_t nEmptyDbf = nHeaderBlockSize * 2 + 1;
210 
211     sal_uInt64 nSize = rStream.TellEnd();
212     if ( nSize < nEmptyDbf )
213         return false;
214 
215     // count of records at 4
216     rStream.Seek(4);
217     sal_uInt32 nRecords(0);
218     rStream.ReadUInt32(nRecords);
219 
220     // length of header starts at 8
221     rStream.Seek(8);
222     sal_uInt16 nHeaderLen;
223     rStream.ReadUInt16( nHeaderLen );
224 
225     // size of record at 10
226     sal_uInt16 nRecordSize(0);
227     rStream.ReadUInt16(nRecordSize);
228 
229     if ( nHeaderLen < nEmptyDbf || nSize < nHeaderLen )
230         return false;
231 
232     // see DTable.cxx ODbaseTable::readHeader()
233     if (0 == nRecordSize)
234         return false;
235 
236     // see DTable.cxx ODbaseTable::construct() line 546
237     if (0 == nRecords)
238     {
239         nRecords = (nSize - nHeaderLen) / nRecordSize;
240     }
241 
242     // tdf#84834 sanity check of size
243     // tdf#106423: a dbf file can have 0 record, so no need to check nRecords
244     if (nSize < nHeaderLen + nRecords * sal_uInt64(nRecordSize))
245         return false;
246 
247     // Last byte of header must be 0x0d, this is how it's specified.
248     // #i9581#,#i26407# but some applications don't follow the specification
249     // and pad the header with one byte 0x00 to reach an
250     // even boundary. Some (#i88577# ) even pad more or pad using a 0x1a ^Z
251     // control character (#i8857#). This results in:
252     // Last byte of header must be 0x0d on 32 bytes boundary.
253     sal_uInt16 nBlocks = (nHeaderLen - 1) / nHeaderBlockSize;
254     sal_uInt8 nEndFlag = 0;
255     while ( nBlocks > 1 && nEndFlag != 0x0d ) {
256         rStream.Seek( nBlocks-- * nHeaderBlockSize );
257         rStream.ReadUChar( nEndFlag );
258     }
259 
260     return ( 0x0d == nEndFlag );
261 }
262 
detect(uno::Sequence<beans::PropertyValue> & lDescriptor)263 OUString SAL_CALL ScFilterDetect::detect( uno::Sequence<beans::PropertyValue>& lDescriptor )
264 {
265     MediaDescriptor aMediaDesc( lDescriptor );
266     OUString aTypeName = aMediaDesc.getUnpackedValueOrDefault( MediaDescriptor::PROP_TYPENAME, OUString() );
267     uno::Reference< io::XInputStream > xStream ( aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY );
268     if ( !xStream.is() )
269         return OUString();
270 
271     SfxMedium aMedium;
272     aMedium.UseInteractionHandler( false );
273     aMedium.setStreamToLoadFrom( xStream, true );
274 
275     SvStream* pStream = aMedium.GetInStream();
276     if ( !pStream || pStream->GetError() )
277         // No stream, no detection.
278         return OUString();
279 
280     const char* pSearchFilterName = nullptr;
281     if (aTypeName == "calc_Lotus")
282     {
283         if (!detectThisFormat(*pStream, pLotus) && !detectThisFormat(*pStream, pLotusNew) && !detectThisFormat(*pStream, pLotus2))
284             return OUString();
285 
286         pSearchFilterName = "Lotus";
287     }
288     else if (aTypeName == "calc_QPro")
289     {
290         if (!detectThisFormat(*pStream, pQPro))
291             return OUString();
292 
293         pSearchFilterName = "Quattro Pro 6.0";
294     }
295     else if (aTypeName == "calc_SYLK")
296     {
297         if (!detectThisFormat(*pStream, pSylk))
298             return OUString();
299 
300         pSearchFilterName = "SYLK";
301     }
302     else if (aTypeName == "calc_DIF")
303     {
304         if (!detectThisFormat(*pStream, pDIF1) && !detectThisFormat(*pStream, pDIF2))
305             return OUString();
306 
307         pSearchFilterName = "DIF";
308     }
309     else if (aTypeName == "calc_dBase")
310     {
311         if (!lcl_MayBeDBase(*pStream))
312             return OUString();
313 
314         pSearchFilterName = "dBase";
315     }
316     else
317         return OUString();
318 
319     SfxFilterMatcher aMatcher(u"scalc"_ustr);
320     std::shared_ptr<const SfxFilter> pFilter = aMatcher.GetFilter4FilterName(OUString::createFromAscii(pSearchFilterName));
321 
322     if (!pFilter)
323         return OUString();
324 
325     aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= pFilter->GetName();
326     aMediaDesc >> lDescriptor;
327     return aTypeName;
328 }
329 
getImplementationName()330 OUString SAL_CALL ScFilterDetect::getImplementationName()
331 {
332     return u"com.sun.star.comp.calc.FormatDetector"_ustr;
333 }
334 
supportsService(const OUString & sServiceName)335 sal_Bool ScFilterDetect::supportsService( const OUString& sServiceName )
336 {
337     return cppu::supportsService(this, sServiceName);
338 }
339 
getSupportedServiceNames()340 css::uno::Sequence<OUString> ScFilterDetect::getSupportedServiceNames()
341 {
342     return { u"com.sun.star.frame.ExtendedTypeDetection"_ustr };
343 }
344 
345 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
com_sun_star_comp_calc_FormatDetector_get_implementation(css::uno::XComponentContext *,css::uno::Sequence<css::uno::Any> const &)346 com_sun_star_comp_calc_FormatDetector_get_implementation(css::uno::XComponentContext* /*context*/,
347                                                          css::uno::Sequence<css::uno::Any> const &)
348 {
349     return cppu::acquire(new ScFilterDetect);
350 }
351 
352 
353 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
354