1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "typedetection.hxx"
21 #include "constant.hxx"
22 
23 #include <com/sun/star/document/XExtendedFilterDetection.hpp>
24 #include <com/sun/star/frame/Desktop.hpp>
25 #include <com/sun/star/util/URLTransformer.hpp>
26 #include <com/sun/star/util/XURLTransformer.hpp>
27 
28 #include <com/sun/star/io/XInputStream.hpp>
29 #include <com/sun/star/io/XSeekable.hpp>
30 #include <com/sun/star/task/XInteractionHandler.hpp>
31 #include <tools/wldcrd.hxx>
32 #include <rtl/ustrbuf.hxx>
33 #include <sal/log.hxx>
34 #include <framework/interaction.hxx>
35 #include <tools/diagnose_ex.h>
36 #include <tools/urlobj.hxx>
37 #include <comphelper/fileurl.hxx>
38 #include <comphelper/processfactory.hxx>
39 #include <comphelper/sequence.hxx>
40 
41 #define DEBUG_TYPE_DETECTION 0
42 
43 #if DEBUG_TYPE_DETECTION
44 #include <iostream>
45 using std::cout;
46 using std::endl;
47 #endif
48 
49 using namespace com::sun::star;
50 
51 namespace filter{
52     namespace config{
53 
54 TypeDetection::TypeDetection(const css::uno::Reference< css::uno::XComponentContext >& rxContext)
55    : m_xContext(rxContext)
56    , m_xTerminateListener(new TerminateDetection(this))
57    , m_bCancel(false)
58 {
59     css::frame::Desktop::create(m_xContext)->addTerminateListener(m_xTerminateListener.get());
60     BaseContainer::init(rxContext                                     ,
61                         TypeDetection::impl_getImplementationName()   ,
62                         TypeDetection::impl_getSupportedServiceNames(),
63                         FilterCache::E_TYPE                           );
64 }
65 
66 
67 TypeDetection::~TypeDetection()
68 {
69     css::frame::Desktop::create(m_xContext)->removeTerminateListener(m_xTerminateListener.get());
70 }
71 
72 
73 OUString SAL_CALL TypeDetection::queryTypeByURL(const OUString& sURL)
74 {
75     OUString sType;
76 
77     // SAFE ->
78     osl::MutexGuard aLock(m_aLock);
79 
80     css::util::URL  aURL;
81     aURL.Complete = sURL;
82     css::uno::Reference< css::util::XURLTransformer > xParser( css::util::URLTransformer::create(m_xContext) );
83     xParser->parseStrict(aURL);
84 
85     // set std types as minimum requirement first!
86     // Only in case no type was found for given URL,
87     // use optional types too ...
88     auto & cache = TheFilterCache::get();
89     FlatDetection lFlatTypes;
90     cache.detectFlatForURL(aURL, lFlatTypes);
91 
92     if (
93         (lFlatTypes.empty()                                ) &&
94         (!cache.isFillState(FilterCache::E_CONTAINS_TYPES))
95        )
96     {
97         cache.load(FilterCache::E_CONTAINS_TYPES);
98         cache.detectFlatForURL(aURL, lFlatTypes);
99     }
100 
101     // first item is guaranteed as "preferred" one!
102     if (!lFlatTypes.empty())
103     {
104         const FlatDetectionInfo& aMatch = *(lFlatTypes.begin());
105         sType = aMatch.sType;
106     }
107 
108     return sType;
109     // <- SAFE
110 }
111 
112 namespace {
113 
114 /**
115  * Rank format types in order of complexity.  More complex formats are
116  * ranked higher so that they get tested sooner over simpler formats.
117  *
118  * Guidelines to determine how complex a format is (subject to change):
119  *
120  * 1) compressed text (XML, HTML, etc)
121  * 2) binary
122  * 3) non-compressed text
123  *   3.1) structured text
124  *     3.1.1) dialect of a structured text (e.g. docbook XML)
125  *     3.1.2) generic structured text (e.g. generic XML)
126  *   3.2) non-structured text
127  *
128  * In each category, rank them from strictly-structured to
129  * loosely-structured.
130  */
131 int getFlatTypeRank(const OUString& rType)
132 {
133     // List formats from more complex to less complex.
134     // TODO: Add more.
135     static const char* ranks[] = {
136 
137         // Compressed XML (ODF XML zip formats)
138         "writer8_template",
139         "writer8",
140         "calc8_template",
141         "calc8",
142         "impress8_template",
143         "impress8",
144         "draw8_template",
145         "draw8",
146         "chart8",
147         "math8",
148         "writerglobal8_template",
149         "writerglobal8",
150         "writerweb8_writer_template",
151         "StarBase",
152 
153         // Compressed XML (OOXML)
154         "writer_OOXML_Text_Template",
155         "writer_OOXML",
156         "writer_MS_Word_2007_Template",
157         "writer_MS_Word_2007",
158         "Office Open XML Spreadsheet Template",
159         "Office Open XML Spreadsheet",
160         "MS Excel 2007 XML Template",
161         "MS Excel 2007 XML",
162         "MS PowerPoint 2007 XML Template",
163         "MS PowerPoint 2007 XML AutoPlay",
164         "MS PowerPoint 2007 XML",
165 
166         // Compressed XML (Uniform/Unified Office Format)
167         "Unified_Office_Format_text",
168         "Unified_Office_Format_spreadsheet",
169         "Unified_Office_Format_presentation",
170 
171         // Compressed XML (StarOffice XML zip formats)
172         "calc_StarOffice_XML_Calc",
173         "calc_StarOffice_XML_Calc_Template",
174         "chart_StarOffice_XML_Chart",
175         "draw_StarOffice_XML_Draw",
176         "draw_StarOffice_XML_Draw_Template",
177         "impress_StarOffice_XML_Impress",
178         "impress_StarOffice_XML_Impress_Template",
179         "math_StarOffice_XML_Math",
180         "writer_StarOffice_XML_Writer",
181         "writer_StarOffice_XML_Writer_Template",
182         "writer_globaldocument_StarOffice_XML_Writer_GlobalDocument",
183         "writer_web_StarOffice_XML_Writer_Web_Template",
184 
185         // Compressed text
186         "pdf_Portable_Document_Format",
187 
188         // Binary
189         "writer_T602_Document",
190         "writer_WordPerfect_Document",
191         "writer_MS_Works_Document",
192         "writer_MS_Word_97_Vorlage",
193         "writer_MS_Word_97",
194         "writer_MS_Word_95_Vorlage",
195         "writer_MS_Word_95",
196         "writer_MS_WinWord_60",
197         "writer_MS_WinWord_5",
198         "MS Excel 2007 Binary",
199         "calc_MS_Excel_97_VorlageTemplate",
200         "calc_MS_Excel_97",
201         "calc_MS_Excel_95_VorlageTemplate",
202         "calc_MS_Excel_95",
203         "calc_MS_Excel_5095_VorlageTemplate",
204         "calc_MS_Excel_5095",
205         "calc_MS_Excel_40_VorlageTemplate",
206         "calc_MS_Excel_40",
207         "calc_Pocket_Excel_File",
208         "impress_MS_PowerPoint_97_Vorlage",
209         "impress_MS_PowerPoint_97_AutoPlay",
210         "impress_MS_PowerPoint_97",
211         "calc_Lotus",
212         "calc_QPro",
213         "calc_SYLK",
214         "calc_DIF",
215         "calc_dBase",
216 
217         // Binary (raster and vector image files)
218         "emf_MS_Windows_Metafile",
219         "wmf_MS_Windows_Metafile",
220         "met_OS2_Metafile",
221         "svm_StarView_Metafile",
222         "sgv_StarDraw_20",
223         "tif_Tag_Image_File",
224         "tga_Truevision_TARGA",
225         "sgf_StarOffice_Writer_SGF",
226         "ras_Sun_Rasterfile",
227         "psd_Adobe_Photoshop",
228         "png_Portable_Network_Graphic",
229         "jpg_JPEG",
230         "mov_MOV",
231         "gif_Graphics_Interchange",
232         "bmp_MS_Windows",
233         "pcx_Zsoft_Paintbrush",
234         "pct_Mac_Pict",
235         "pcd_Photo_CD_Base",
236         "pcd_Photo_CD_Base4",
237         "pcd_Photo_CD_Base16",
238         "impress_CGM_Computer_Graphics_Metafile", // There is binary and ascii variants ?
239         "draw_WordPerfect_Graphics",
240         "draw_Visio_Document",
241         "draw_Publisher_Document",
242         "draw_Corel_Presentation_Exchange",
243         "draw_CorelDraw_Document",
244         "writer_LotusWordPro_Document",
245         "writer_MIZI_Hwp_97", // Hanword (Hancom Office)
246 
247         // Non-compressed XML
248         "writer_ODT_FlatXML",
249         "calc_ODS_FlatXML",
250         "impress_ODP_FlatXML",
251         "draw_ODG_FlatXML",
252         "calc_ADO_rowset_XML",
253         "calc_MS_Excel_2003_XML",
254         "writer_MS_Word_2003_XML",
255         "writer_DocBook_File",
256         "XHTML_File",
257         "svg_Scalable_Vector_Graphics",
258         "math_MathML_XML_Math",
259 
260         // Non-compressed text
261         "dxf_AutoCAD_Interchange",
262         "eps_Encapsulated_PostScript",
263         "pbm_Portable_Bitmap",   // There is 'raw' and 'ascii' variants.
264         "ppm_Portable_Pixelmap", // There is 'raw' and 'ascii' variants.
265         "pgm_Portable_Graymap",  // There is 'raw' and 'ascii' variants.
266         "xpm_XPM",
267         "xbm_X_Consortium",
268         "writer_Rich_Text_Format",
269         "writer_web_HTML_help",
270         "generic_HTML",
271 
272         "generic_Text", // Plain text (catch all)
273 
274         // Anything ranked lower than generic_Text will never be used during
275         // type detection (since generic_Text catches all).
276 
277         // Export only
278         "writer_layout_dump_xml",
279         "graphic_SWF",
280         "graphic_HTML",
281 
282         // Internal use only
283         "StarBaseReportChart",
284         "StarBaseReport",
285         "math_MathType_3x", // MathType equation embedded in Word doc.
286     };
287 
288     size_t n = SAL_N_ELEMENTS(ranks);
289 
290     for (size_t i = 0; i < n; ++i)
291     {
292         if (rType.equalsAscii(ranks[i]))
293             return n - i - 1;
294     }
295 
296     // Not ranked.  Treat them equally.  Unranked formats have higher priority
297     // than the ranked internal ones since they may be defined externally.
298     return n;
299 }
300 
301 /**
302  * Types with matching pattern first, then extension, then custom ranks by
303  * types, then types that are supported by the document service come next.
304  * Lastly, sort them alphabetically.
305  */
306 struct SortByPriority
307 {
308     bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
309     {
310         if (r1.bMatchByPattern != r2.bMatchByPattern)
311             return r1.bMatchByPattern;
312 
313         if (r1.bMatchByExtension != r2.bMatchByExtension)
314             return r1.bMatchByExtension;
315 
316         int rank1 = getFlatTypeRank(r1.sType);
317         int rank2 = getFlatTypeRank(r2.sType);
318 
319         if (rank1 != rank2)
320             return rank1 > rank2;
321 
322         if (r1.bPreselectedByDocumentService != r2.bPreselectedByDocumentService)
323             return r1.bPreselectedByDocumentService;
324 
325         // All things being equal, sort them alphabetically.
326         return r1.sType > r2.sType;
327     }
328 };
329 
330 struct SortByType
331 {
332     bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
333     {
334         return r1.sType > r2.sType;
335     }
336 };
337 
338 struct EqualByType
339 {
340     bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
341     {
342         return r1.sType == r2.sType;
343     }
344 };
345 
346 class FindByType
347 {
348     OUString maType;
349 public:
350     explicit FindByType(const OUString& rType) : maType(rType) {}
351     bool operator() (const FlatDetectionInfo& rInfo) const
352     {
353         return rInfo.sType == maType;
354     }
355 };
356 
357 #if DEBUG_TYPE_DETECTION
358 void printFlatDetectionList(const char* caption, const FlatDetection& types)
359 {
360     cout << "-- " << caption << " (size=" << types.size() << ")" << endl;
361     for (auto const& item : types)
362     {
363         cout << "  type='" << item.sType << "'; match by extension (" << item.bMatchByExtension
364             << "); match by pattern (" << item.bMatchByPattern << "); pre-selected by doc service ("
365             << item.bPreselectedByDocumentService << ")" << endl;
366     }
367     cout << "--" << endl;
368 }
369 #endif
370 
371 }
372 
/**
 * Detects the type of the document described by a media descriptor.
 *
 * Steps, as implemented below:
 *  1. If the descriptor names a filter and that filter validates, its type is
 *     returned right away.
 *  2. Otherwise all candidate types are collected, prioritized and handed to
 *     impl_detectTypeFlatAndDeep().
 *  3. If that yields nothing (and detection was not cancelled), the user is
 *     asked via impl_askUserForTypeAndFilterIfAllowed(); after that the
 *     "last chance" type reported by step 2 is used, if any.
 *  4. The descriptor is completed with a matching filter/type and written
 *     back to lDescriptor.
 *
 * @param lDescriptor  IN/OUT media descriptor; on the normal path it is
 *                     overwritten with the completed descriptor.
 * @param bAllowDeep   passed on to impl_detectTypeFlatAndDeep().
 * @return the detected type name; empty if nothing was detected or a
 *         non-runtime UNO exception occurred during detection.
 * @throws css::uno::RuntimeException  re-thrown unchanged.
 */
OUString SAL_CALL TypeDetection::queryTypeByDescriptor(css::uno::Sequence< css::beans::PropertyValue >& lDescriptor,
                                                              sal_Bool                                         bAllowDeep )
{
    // Wrap the property sequence in a MediaDescriptor for keyed access.
    utl::MediaDescriptor stlDescriptor(lDescriptor);
    OUString sType, sURL;

    try
    {
        // SAFE -> ----------------------------------
        osl::ClearableMutexGuard aLock(m_aLock);

        // Fetch the URL from the descriptor (empty if the property is missing).
        sURL = stlDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_URL(), OUString());

#if OSL_DEBUG_LEVEL > 0
        if (stlDescriptor.find( "FileName" ) != stlDescriptor.end())
            OSL_FAIL("Detect using of deprecated and already unsupported MediaDescriptor property \"FileName\"!");
#endif

        // Parse the URL to split it into e.g. its main part and jump mark.
        css::util::URL  aURL;
        aURL.Complete = sURL;
        css::uno::Reference< css::util::XURLTransformer > xParser(css::util::URLTransformer::create(m_xContext));
        xParser->parseStrict(aURL);

        OUString aSelectedFilter = stlDescriptor.getUnpackedValueOrDefault(
            utl::MediaDescriptor::PROP_FILTERNAME(), OUString());
        if (!aSelectedFilter.isEmpty())
        {
            // Caller specified the filter type.  Honor it.  Just get the default
            // type for that filter, and bail out.
            // NOTE(review): this early return skips the write-back
            // "stlDescriptor >> lDescriptor" at the end of the function, so
            // changes made to stlDescriptor here do not reach lDescriptor -
            // confirm that this is intended.
            if (impl_validateAndSetFilterOnDescriptor(stlDescriptor, aSelectedFilter))
                return stlDescriptor[utl::MediaDescriptor::PROP_TYPENAME()].get<OUString>();
        }

        FlatDetection lFlatTypes;
        impl_getAllFormatTypes(aURL, stlDescriptor, lFlatTypes);

        aLock.clear();
        // <- SAFE ----------------------------------

        // Properly prioritize all candidate types and drop adjacent entries
        // that carry the same type name.
        std::stable_sort(lFlatTypes.begin(), lFlatTypes.end(), SortByPriority());
        auto last = std::unique(lFlatTypes.begin(), lFlatTypes.end(), EqualByType());
        lFlatTypes.erase(last, lFlatTypes.end());

        OUString sLastChance;

        // Verify every flat detected (or preselected!) type by calling its
        // registered deep detection service.
        // That loop stops early if a type matches the given descriptor by a
        // URL pattern(!) or if deep detection is not allowed by the caller
        // (bAllowDeep=sal_False). The whole detection is aborted by an
        // exception if creating the possibly needed input stream fails,
        // e.g. with an IO exception ...
        std::vector<OUString> lUsedDetectors;
        if (!lFlatTypes.empty())
            sType = impl_detectTypeFlatAndDeep(stlDescriptor, lFlatTypes, bAllowDeep, lUsedDetectors, sLastChance);

        // Flat detection failed and pure deep detection failed
        // => ask a possibly existing InteractionHandler,
        // which means: ask the user for a decision.
        if (sType.isEmpty() && !m_bCancel)
            sType = impl_askUserForTypeAndFilterIfAllowed(stlDescriptor);


        // No really detected type - but a possibly valid one.
        // Use the "last chance" type as the return value.
        if (sType.isEmpty() && !sLastChance.isEmpty() && !m_bCancel)
        {
            OSL_FAIL("set first flat detected type without a registered deep detection service as \"last chance\" ... nevertheless some other deep detections said \"NO\". I TRY IT!");
            sType = sLastChance;
        }
    }
    catch(const css::uno::RuntimeException&)
    {
        // Runtime exceptions are passed on to the caller untouched.
        throw;
    }
    catch(const css::uno::Exception&)
    {
        // Any other UNO exception is logged and treated as "nothing detected".
        TOOLS_WARN_EXCEPTION("filter.config", "caught exception while querying type of " << sURL);
        sType.clear();
    }

    // Adapt the media descriptor so that it contains the right values
    // for type, filter name, document service etc.
    impl_checkResultsAndAddBestFilter(stlDescriptor, sType); // Attention: sType is passed by non-const reference (IN/OUT parameter) here.
    impl_validateAndSetTypeOnDescriptor(stlDescriptor, sType);

    // Write the completed descriptor back into the caller's sequence.
    stlDescriptor >> lDescriptor;
    return sType;
}
466 
467 
468 void TypeDetection::impl_checkResultsAndAddBestFilter(utl::MediaDescriptor& rDescriptor,
469                                                       OUString&               sType      )
470 {
471     // a)
472     // Don't overwrite a might preselected filter!
473     OUString sFilter = rDescriptor.getUnpackedValueOrDefault(
474                                 utl::MediaDescriptor::PROP_FILTERNAME(),
475                                 OUString());
476     if (!sFilter.isEmpty())
477         return;
478 
479     auto & cache = TheFilterCache::get();
480 
481     // b)
482     // check a preselected document service too.
483     // Then we have to search a suitable filter within this module.
484     OUString sDocumentService = rDescriptor.getUnpackedValueOrDefault(
485                                             utl::MediaDescriptor::PROP_DOCUMENTSERVICE(),
486                                             OUString());
487     if (!sDocumentService.isEmpty())
488     {
489         try
490         {
491             OUString sRealType = sType;
492 
493             // SAFE ->
494             ::osl::ResettableMutexGuard aLock(m_aLock);
495 
496             // Attention: For executing next lines of code, We must be sure that
497             // all filters already loaded :-(
498             // That can disturb our "load on demand feature". But we have no other chance!
499             cache.load(FilterCache::E_CONTAINS_FILTERS);
500 
501             CacheItem lIProps;
502             lIProps[PROPNAME_DOCUMENTSERVICE] <<= sDocumentService;
503             lIProps[PROPNAME_TYPE           ] <<= sRealType;
504             std::vector<OUString> lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
505 
506             aLock.clear();
507             // <- SAFE
508 
509             for (auto const& filter : lFilters)
510             {
511                 // SAFE ->
512                 aLock.reset();
513                 try
514                 {
515                     CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, filter);
516                     sal_Int32 nFlags  = 0;
517                     aFilter[PROPNAME_FLAGS] >>= nFlags;
518 
519                     if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::IMPORT)
520                         sFilter = filter;
521                     if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::PREFERED)
522                         break;
523                 }
524                 catch(const css::uno::Exception&) {}
525                 aLock.clear();
526                 // <- SAFE
527             }
528 
529             if (!sFilter.isEmpty())
530             {
531                 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()  ] <<= sRealType;
532                 rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME()] <<= sFilter;
533                 sType = sRealType;
534                 return;
535             }
536         }
537         catch(const css::uno::Exception&)
538             {}
539     }
540 
541     // c)
542     // We can use the preferred filter for the specified type.
543     // Such preferred filter points:
544     // - to the default filter of the preferred application
545     // - or to any other filter if no preferred filter was set.
546     // Note: It's an optimization only!
547     // It's not guaranteed, that such preferred filter exists.
548     sFilter.clear();
549     try
550     {
551         // SAFE ->
552         osl::ClearableMutexGuard aLock(m_aLock);
553 
554         CacheItem aType = cache.getItem(FilterCache::E_TYPE, sType);
555         aType[PROPNAME_PREFERREDFILTER] >>= sFilter;
556         CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, sFilter);
557 
558         aLock.clear();
559         // <- SAFE
560 
561         // no exception => found valid type and filter => set it on the given descriptor
562         rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()  ] <<= sType  ;
563         rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME()] <<= sFilter;
564         return;
565     }
566     catch(const css::uno::Exception&)
567         {}
568 
569     // d)
570     // Search for any import(!) filter, which is registered for this type.
571     sFilter.clear();
572     try
573     {
574         // SAFE ->
575         ::osl::ResettableMutexGuard aLock(m_aLock);
576 
577         // Attention: For executing next lines of code, We must be sure that
578         // all filters already loaded :-(
579         // That can disturb our "load on demand feature". But we have no other chance!
580         cache.load(FilterCache::E_CONTAINS_FILTERS);
581 
582         CacheItem lIProps;
583         lIProps[PROPNAME_TYPE] <<= sType;
584         std::vector<OUString> lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
585 
586         aLock.clear();
587         // <- SAFE
588 
589         for (auto const& filter : lFilters)
590         {
591             sFilter = filter;
592 
593             // SAFE ->
594             aLock.reset();
595             try
596             {
597                 CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, sFilter);
598                 sal_Int32 nFlags  = 0;
599                 aFilter[PROPNAME_FLAGS] >>= nFlags;
600 
601                 if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::IMPORT)
602                     break;
603             }
604             catch(const css::uno::Exception&)
605                 { continue; }
606             aLock.clear();
607             // <- SAFE
608 
609             sFilter.clear();
610         }
611 
612         if (!sFilter.isEmpty())
613         {
614             rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()  ] <<= sType  ;
615             rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME()] <<= sFilter;
616             return;
617         }
618     }
619     catch(const css::uno::Exception&)
620         {}
621 }
622 
623 
624 bool TypeDetection::impl_getPreselectionForType(
625     const OUString& sPreSelType, const util::URL& aParsedURL, FlatDetection& rFlatTypes, bool bDocService)
626 {
627     // Can be used to suppress execution of some parts of this method
628     // if it's already clear that detected type is valid or not.
629     // It's necessary to use shared code at the end, which update
630     // all return parameters consistency!
631     bool bBreakDetection = false;
632 
633     // Further we must know if it matches by pattern
634     // Every flat detected type by pattern won't be detected deep!
635     bool bMatchByPattern = false;
636 
637     // And we must know if a preselection must be preferred, because
638     // it matches by its extension too.
639     bool bMatchByExtension = false;
640 
641     // validate type
642     OUString sType(sPreSelType);
643     CacheItem       aType;
644     try
645     {
646         // SAFE -> --------------------------
647         osl::MutexGuard aLock(m_aLock);
648         aType = TheFilterCache::get().getItem(FilterCache::E_TYPE, sType);
649         // <- SAFE --------------------------
650     }
651     catch(const css::container::NoSuchElementException&)
652     {
653         sType.clear();
654         bBreakDetection = true;
655     }
656 
657     if (!bBreakDetection)
658     {
659         // We can't check a preselected type for a given stream!
660         // So we must believe, that it can work ...
661         if ( aParsedURL.Complete == "private:stream" )
662             bBreakDetection = true;
663     }
664 
665     if (!bBreakDetection)
666     {
667         // extract extension from URL .. to check it case-insensitive !
668         INetURLObject   aParser    (aParsedURL.Main);
669         OUString sExtension = aParser.getExtension(INetURLObject::LAST_SEGMENT       ,
670                                                           true                          ,
671                                                           INetURLObject::DecodeMechanism::WithCharset);
672         sExtension = sExtension.toAsciiLowerCase();
673 
674         // otherwise we must know, if it matches to the given URL really.
675         // especially if it matches by its extension or pattern registration.
676         std::vector<OUString> lExtensions(comphelper::sequenceToContainer< std::vector<OUString> >(aType[PROPNAME_EXTENSIONS].get<css::uno::Sequence<OUString> >() ));
677         std::vector<OUString> lURLPattern(comphelper::sequenceToContainer< std::vector<OUString> >(aType[PROPNAME_URLPATTERN].get<css::uno::Sequence<OUString> >() ));
678 
679         for (auto const& extension : lExtensions)
680         {
681             OUString sCheckExtension(extension.toAsciiLowerCase());
682             if (sCheckExtension == sExtension)
683             {
684                 bBreakDetection        = true;
685                 bMatchByExtension      = true;
686                 break;
687             }
688         }
689 
690         if (!bBreakDetection)
691         {
692             for (auto const& elem : lURLPattern)
693             {
694                 WildCard aCheck(elem);
695                 if (aCheck.Matches(aParsedURL.Main))
696                 {
697                     bMatchByPattern = true;
698                     break;
699                 }
700             }
701         }
702     }
703 
704     // if it's a valid type - set it on all return values!
705     if (!sType.isEmpty())
706     {
707         FlatDetection::iterator it = std::find_if(rFlatTypes.begin(), rFlatTypes.end(), FindByType(sType));
708         if (it != rFlatTypes.end())
709         {
710             if (bMatchByExtension)
711                 it->bMatchByExtension = true;
712             if (bMatchByPattern)
713                 it->bMatchByPattern = true;
714             if (bDocService)
715                 it->bPreselectedByDocumentService = true;
716         }
717 
718         return true;
719     }
720 
721     // not valid!
722     return false;
723 }
724 
725 void TypeDetection::impl_getPreselectionForDocumentService(
726     const OUString& sPreSelDocumentService, const util::URL& aParsedURL, FlatDetection& rFlatTypes)
727 {
728     // get all filters, which match to this doc service
729     std::vector<OUString> lFilters;
730     try
731     {
732         // SAFE -> --------------------------
733         osl::MutexGuard aLock(m_aLock);
734 
735         // Attention: For executing next lines of code, We must be sure that
736         // all filters already loaded :-(
737         // That can disturb our "load on demand feature". But we have no other chance!
738         auto & cache = TheFilterCache::get();
739         cache.load(FilterCache::E_CONTAINS_FILTERS);
740 
741         CacheItem lIProps;
742         lIProps[PROPNAME_DOCUMENTSERVICE] <<= sPreSelDocumentService;
743         lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
744         // <- SAFE --------------------------
745     }
746     catch (const css::container::NoSuchElementException&)
747     {
748         lFilters.clear();
749     }
750 
751     // step over all filters, and check if its registered type
752     // match the given URL.
753     // But use temp. list of "preselected types" instead of incoming rFlatTypes list!
754     // The reason behind: we must filter the obtained results. And copying stl entries
755     // is an easier job than removing them .-)
756     for (auto const& filter : lFilters)
757     {
758         OUString aType = impl_getTypeFromFilter(filter);
759         if (aType.isEmpty())
760             continue;
761 
762         impl_getPreselectionForType(aType, aParsedURL, rFlatTypes, true);
763     }
764 }
765 
766 OUString TypeDetection::impl_getTypeFromFilter(const OUString& rFilterName)
767 {
768     CacheItem aFilter;
769     try
770     {
771         osl::MutexGuard aLock(m_aLock);
772         aFilter = TheFilterCache::get().getItem(FilterCache::E_FILTER, rFilterName);
773     }
774     catch (const container::NoSuchElementException&)
775     {
776         return OUString();
777     }
778 
779     OUString aType;
780     aFilter[PROPNAME_TYPE] >>= aType;
781     return aType;
782 }
783 
784 void TypeDetection::impl_getAllFormatTypes(
785     const util::URL& aParsedURL, utl::MediaDescriptor const & rDescriptor, FlatDetection& rFlatTypes)
786 {
787     rFlatTypes.clear();
788 
789     // Get all filters that we have.
790     std::vector<OUString> aFilterNames;
791     try
792     {
793         osl::MutexGuard aLock(m_aLock);
794         auto & cache = TheFilterCache::get();
795         cache.load(FilterCache::E_CONTAINS_FILTERS);
796         aFilterNames = cache.getItemNames(FilterCache::E_FILTER);
797     }
798     catch (const container::NoSuchElementException&)
799     {
800         return;
801     }
802 
803     // Retrieve the default type for each of these filters, and store them.
804     for (auto const& filterName : aFilterNames)
805     {
806         OUString aType = impl_getTypeFromFilter(filterName);
807 
808         if (aType.isEmpty())
809             continue;
810 
811         FlatDetectionInfo aInfo; // all flags set to false by default.
812         aInfo.sType = aType;
813         rFlatTypes.push_back(aInfo);
814     }
815 
816     {
817         // Get all types that match the URL alone.
818         FlatDetection aFlatByURL;
819         TheFilterCache::get().detectFlatForURL(aParsedURL, aFlatByURL);
820         for (auto const& elem : aFlatByURL)
821         {
822             FlatDetection::iterator itPos = std::find_if(rFlatTypes.begin(), rFlatTypes.end(), FindByType(elem.sType));
823             if (itPos == rFlatTypes.end())
824                 // Not in the list yet.
825                 rFlatTypes.push_back(elem);
826             else
827             {
828                 // Already in the list. Update the flags.
829                 FlatDetectionInfo& rInfo = *itPos;
830                 const FlatDetectionInfo& rThisInfo = elem;
831                 if (rThisInfo.bMatchByExtension)
832                     rInfo.bMatchByExtension = true;
833                 if (rThisInfo.bMatchByPattern)
834                     rInfo.bMatchByPattern = true;
835                 if (rThisInfo.bPreselectedByDocumentService)
836                     rInfo.bPreselectedByDocumentService = true;
837             }
838         }
839     }
840 
841     // Remove duplicates.
842     std::stable_sort(rFlatTypes.begin(), rFlatTypes.end(), SortByType());
843     auto last = std::unique(rFlatTypes.begin(), rFlatTypes.end(), EqualByType());
844     rFlatTypes.erase(last, rFlatTypes.end());
845 
846     // Mark pre-selected type (if any) to have it prioritized.
847     OUString sSelectedType = rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_TYPENAME(), OUString());
848     if (!sSelectedType.isEmpty())
849         impl_getPreselectionForType(sSelectedType, aParsedURL, rFlatTypes, false);
850 
851     // Mark all types preferred by the current document service, to have it prioritized.
852     OUString sSelectedDoc = rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString());
853     if (!sSelectedDoc.isEmpty())
854         impl_getPreselectionForDocumentService(sSelectedDoc, aParsedURL, rFlatTypes);
855 }
856 
857 
858 OUString TypeDetection::impl_detectTypeFlatAndDeep(      utl::MediaDescriptor& rDescriptor   ,
859                                                           const FlatDetection&                 lFlatTypes    ,
860                                                                 bool                       bAllowDeep    ,
861                                                                 std::vector<OUString>&         rUsedDetectors,
862                                                                 OUString&               rLastChance   )
863 {
864     // reset it everytimes, so the outside code can distinguish between
865     // a set and a not set value.
866     rLastChance.clear();
867     rUsedDetectors.clear();
868 
869     // step over all possible types for this URL.
870     // solutions:
871     // a) no types                                => no detection
872     // b) deep detection not allowed              => return first valid type of list (because it's the preferred or the first valid one)
873     //    or(!) match by URLPattern               => in such case a deep detection will be suppressed!
874     // c) type has no detect service              => safe the first occurred type without a detect service
875     //                                               as "last chance"(!). It will be used outside of this method
876     //                                               if no further type could be detected.
877     //                                               It must be the first one, because it can be a preferred type.
878     //                                               Our types list was sorted by such criteria!
879     // d) detect service return a valid result    => return its decision
880     // e) detect service return an invalid result
881     //    or any needed information could not be
882     //    obtained from the cache                 => ignore it, and continue with search
883 
884     for (auto const& flatTypeInfo : lFlatTypes)
885     {
886         if (m_bCancel)
887             break;
888         OUString sFlatType = flatTypeInfo.sType;
889 
890         if (!impl_validateAndSetTypeOnDescriptor(rDescriptor, sFlatType))
891             continue;
892 
893         // b)
894         if (
895             (!bAllowDeep                  ) ||
896             (flatTypeInfo.bMatchByPattern)
897            )
898         {
899             return sFlatType;
900         }
901 
902         try
903         {
904             // SAFE -> ----------------------------------
905             osl::ClearableMutexGuard aLock(m_aLock);
906             CacheItem aType = TheFilterCache::get().getItem(FilterCache::E_TYPE, sFlatType);
907             aLock.clear();
908 
909             OUString sDetectService;
910             aType[PROPNAME_DETECTSERVICE] >>= sDetectService;
911 
912             // c)
913             if (sDetectService.isEmpty())
914             {
915                 // flat detected types without any registered deep detection service and not
916                 // preselected by the user can be used as LAST CHANCE in case no other type could
917                 // be detected. Of course only the first type without deep detector can be used.
918                 // Further ones has to be ignored.
919                 if (rLastChance.isEmpty())
920                     rLastChance = sFlatType;
921 
922                 continue;
923             }
924 
925             // don't forget to add every real asked deep detection service here.
926             // Such detectors will be ignored if may be "impl_detectTypeDeepOnly()"
927             // must be called later!
928             rUsedDetectors.push_back(sDetectService);
929             OUString sDeepType = impl_askDetectService(sDetectService, rDescriptor);
930 
931             // d)
932             if (!sDeepType.isEmpty())
933                 return sDeepType;
934         }
935         catch(const css::container::NoSuchElementException&)
936             {}
937         // e)
938     }
939 
940     return OUString();
941     // <- SAFE ----------------------------------
942 }
943 
944 void TypeDetection::impl_seekStreamToZero(utl::MediaDescriptor const & rDescriptor)
945 {
946     // try to seek to 0 ...
947     // But because XSeekable is an optional interface ... try it only .-)
948     css::uno::Reference< css::io::XInputStream > xStream = rDescriptor.getUnpackedValueOrDefault(
949                                                             utl::MediaDescriptor::PROP_INPUTSTREAM(),
950                                                             css::uno::Reference< css::io::XInputStream >());
951     css::uno::Reference< css::io::XSeekable > xSeek(xStream, css::uno::UNO_QUERY);
952     if (xSeek.is())
953     {
954         try
955         {
956             xSeek->seek(0);
957         }
958         catch(const css::uno::RuntimeException&)
959         {
960             throw;
961         }
962         catch(const css::uno::Exception&)
963         {
964         }
965     }
966 }
967 
968 OUString TypeDetection::impl_askDetectService(const OUString&               sDetectService,
969                                                            utl::MediaDescriptor& rDescriptor   )
970 {
971     // Open the stream and add it to the media descriptor if this method is called for the first time.
972     // All following requests to this method will detect, that there already exists a stream .-)
973     // Attention: This method throws an exception if the stream could not be opened.
974     // It's important to break any further detection in such case.
975     // Catch it on the highest detection level only !!!
976     impl_openStream(rDescriptor);
977 
978     // seek to 0 is an optional feature to be more robust against
979     // "simple implemented detect services" .-)
980     impl_seekStreamToZero(rDescriptor);
981 
982     css::uno::Reference< css::document::XExtendedFilterDetection > xDetector;
983     css::uno::Reference< css::uno::XComponentContext >         xContext;
984 
985     // SAFE ->
986     {
987         osl::MutexGuard aLock(m_aLock);
988         xContext = m_xContext;
989     }
990     // <- SAFE
991 
992     try
993     {
994         // Attention! If e.g. an office module was not installed sometimes we
995         // find a registered detect service, which is referred inside the
996         // configuration ... but not really installed. On the other side we use
997         // third party components here, which can make trouble anyway.  So we
998         // should handle errors during creation of such services more
999         // gracefully .-)
1000         xDetector.set(
1001                 xContext->getServiceManager()->createInstanceWithContext(sDetectService, xContext),
1002                 css::uno::UNO_QUERY_THROW);
1003     }
1004     catch (...)
1005     {
1006     }
1007 
1008     if ( ! xDetector.is())
1009         return OUString();
1010 
1011     OUString sDeepType;
1012     try
1013     {
1014         // start deep detection
1015         // Don't forget to convert stl descriptor to its uno representation.
1016 
1017         /* Attention!
1018                 You have to use an explicit instance of this uno sequence...
1019                 Because it's used as an in out parameter. And in case of a temp. used object
1020                 we will run into memory corruptions!
1021         */
1022         css::uno::Sequence< css::beans::PropertyValue > lDescriptor;
1023         rDescriptor >> lDescriptor;
1024         sDeepType = xDetector->detect(lDescriptor);
1025         rDescriptor << lDescriptor;
1026     }
1027     catch (...)
1028     {
1029         // We should ignore errors here.
1030         // Thrown exceptions mostly will end in crash recovery...
1031         // But might be we find another deep detection service which can detect the same
1032         // document without a problem .-)
1033         sDeepType.clear();
1034     }
1035 
1036     // seek to 0 is an optional feature to be more robust against
1037     // "simple implemented detect services" .-)
1038     impl_seekStreamToZero(rDescriptor);
1039 
1040     // analyze the results
1041     // a) detect service returns "" => return "" too and remove TYPE/FILTER prop from descriptor
1042     // b) returned type is unknown  => return "" too and remove TYPE/FILTER prop from descriptor
1043     // c) returned type is valid    => check TYPE/FILTER props inside descriptor and return the type
1044 
1045     // this special helper checks for a valid type
1046     // and set right values on the descriptor!
1047     bool bValidType = impl_validateAndSetTypeOnDescriptor(rDescriptor, sDeepType);
1048     if (bValidType)
1049         return sDeepType;
1050 
1051     return OUString();
1052 }
1053 
1054 
1055 OUString TypeDetection::impl_askUserForTypeAndFilterIfAllowed(utl::MediaDescriptor& rDescriptor)
1056 {
1057     css::uno::Reference< css::task::XInteractionHandler > xInteraction =
1058         rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_INTERACTIONHANDLER(),
1059         css::uno::Reference< css::task::XInteractionHandler >());
1060 
1061     if (!xInteraction.is())
1062         return OUString();
1063 
1064     OUString sURL =
1065         rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_URL(),
1066         OUString());
1067 
1068     css::uno::Reference< css::io::XInputStream > xStream =
1069         rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_INPUTSTREAM(),
1070         css::uno::Reference< css::io::XInputStream >());
1071 
1072     // Don't disturb the user for "non existing files - means empty URLs" or
1073     // if we were forced to detect a stream.
1074     // Reason behind: we must be sure to ask user for "unknown contents" only...
1075     // and not for "missing files". Especially if detection is done by a stream only
1076     // we can't check if the stream points to an "existing content"!
1077     if (
1078         (sURL.isEmpty()                                     ) || // "non existing file" ?
1079         (!xStream.is()                                         ) || // non existing file !
1080         (sURL.equalsIgnoreAsciiCase("private:stream"))    // not a good idea .-)
1081        )
1082         return OUString();
1083 
1084     try
1085     {
1086         // create a new request to ask user for its decision about the usable filter
1087         ::framework::RequestFilterSelect aRequest(sURL);
1088         xInteraction->handle(aRequest.GetRequest());
1089 
1090         // "Cancel" pressed? => return with error
1091         if (aRequest.isAbort())
1092             return OUString();
1093 
1094         // "OK" pressed => verify the selected filter, get its corresponding
1095         // type and return it. (BTW: We must update the media descriptor here ...)
1096         // The user selected explicitly a filter ... but normally we are interested on
1097         // a type here only. But we must be sure, that the selected filter is used
1098         // too and no ambiguous filter registration disturb us .-)
1099 
1100         OUString sFilter = aRequest.getFilter();
1101         if (!impl_validateAndSetFilterOnDescriptor(rDescriptor, sFilter))
1102             return OUString();
1103 
1104         OUString sType;
1105         rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()] >>= sType;
1106         return sType;
1107     }
1108     catch(const css::uno::Exception&)
1109         {}
1110 
1111     return OUString();
1112 }
1113 
1114 
1115 void TypeDetection::impl_openStream(utl::MediaDescriptor& rDescriptor)
1116 {
1117     bool bSuccess = false;
1118     OUString sURL = rDescriptor.getUnpackedValueOrDefault( utl::MediaDescriptor::PROP_URL(), OUString() );
1119     bool bRequestedReadOnly = rDescriptor.getUnpackedValueOrDefault( utl::MediaDescriptor::PROP_READONLY(), false );
1120     if ( comphelper::isFileUrl( sURL ) )
1121     {
1122         // OOo uses own file locking mechanics in case of local file
1123         bSuccess = rDescriptor.addInputStreamOwnLock();
1124     }
1125     else
1126         bSuccess = rDescriptor.addInputStream();
1127 
1128     if ( !bSuccess )
1129         throw css::uno::Exception(
1130             "Could not open stream for <" + sURL + ">",
1131             static_cast<OWeakObject *>(this));
1132 
1133     if ( !bRequestedReadOnly )
1134     {
1135         // The MediaDescriptor implementation adds ReadOnly argument if the file can not be opened for writing
1136         // this argument should be either removed or an additional argument should be added so that application
1137         // can separate the case when the user explicitly requests readonly document.
1138         // The current solution is to remove it here.
1139         rDescriptor.erase( utl::MediaDescriptor::PROP_READONLY() );
1140     }
1141 }
1142 
1143 
1144 void TypeDetection::impl_removeTypeFilterFromDescriptor(utl::MediaDescriptor& rDescriptor)
1145 {
1146     utl::MediaDescriptor::iterator pItType   = rDescriptor.find(utl::MediaDescriptor::PROP_TYPENAME()  );
1147     utl::MediaDescriptor::iterator pItFilter = rDescriptor.find(utl::MediaDescriptor::PROP_FILTERNAME());
1148     if (pItType != rDescriptor.end())
1149         rDescriptor.erase(pItType);
1150     if (pItFilter != rDescriptor.end())
1151         rDescriptor.erase(pItFilter);
1152 }
1153 
1154 
1155 bool TypeDetection::impl_validateAndSetTypeOnDescriptor(      utl::MediaDescriptor& rDescriptor,
1156                                                             const OUString&               sType      )
1157 {
1158     // SAFE ->
1159     {
1160         osl::MutexGuard aLock(m_aLock);
1161         if (TheFilterCache::get().hasItem(FilterCache::E_TYPE, sType))
1162         {
1163             rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()] <<= sType;
1164             return true;
1165         }
1166     }
1167     // <- SAFE
1168 
1169     // remove all related information from the descriptor
1170     impl_removeTypeFilterFromDescriptor(rDescriptor);
1171     return false;
1172 }
1173 
1174 
1175 bool TypeDetection::impl_validateAndSetFilterOnDescriptor(      utl::MediaDescriptor& rDescriptor,
1176                                                               const OUString&               sFilter    )
1177 {
1178     try
1179     {
1180         // SAFE ->
1181         osl::ClearableMutexGuard aLock(m_aLock);
1182 
1183         auto & cache = TheFilterCache::get();
1184         CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, sFilter);
1185         OUString sType;
1186         aFilter[PROPNAME_TYPE] >>= sType;
1187 
1188         aLock.clear();
1189         // <- SAFE
1190 
1191         // found valid type and filter => set it on the given descriptor
1192         rDescriptor[utl::MediaDescriptor::PROP_TYPENAME()  ] <<= sType  ;
1193         rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME()] <<= sFilter;
1194         return true;
1195     }
1196     catch(const css::container::NoSuchElementException&){}
1197 
1198     // remove all related information from the descriptor
1199     impl_removeTypeFilterFromDescriptor(rDescriptor);
1200     return false;
1201 }
1202 
1203 
1204 OUString TypeDetection::impl_getImplementationName()
1205 {
1206     return "com.sun.star.comp.filter.config.TypeDetection";
1207 }
1208 
1209 
1210 css::uno::Sequence< OUString > TypeDetection::impl_getSupportedServiceNames()
1211 {
1212     return { "com.sun.star.document.TypeDetection" };
1213 }
1214 
1215 
1216 css::uno::Reference< css::uno::XInterface > TypeDetection::impl_createInstance(const css::uno::Reference< css::lang::XMultiServiceFactory >& xSMGR)
1217 {
1218     TypeDetection* pNew = new TypeDetection( comphelper::getComponentContext(xSMGR) );
1219     return css::uno::Reference< css::uno::XInterface >(static_cast< css::document::XTypeDetection* >(pNew), css::uno::UNO_QUERY);
1220 }
1221 
1222     } // namespace config
1223 } // namespace filter
1224 
1225 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
1226