xref: /core/ucb/source/regexp/regexp.cxx (revision 310ce513)
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <regexp.hxx>
21 
22 #include <cstddef>
23 
24 #include <osl/diagnose.h>
25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
26 #include <rtl/character.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/ustring.hxx>
29 
30 using namespace com::sun::star;
31 using namespace ucb_impl;
32 
33 
34 //  Regexp
35 
36 
// Construct a Regexp directly from its already-parsed components.
//
// eTheKind          which of the supported pattern shapes this is
// rThePrefix        literal text the input must start with
// bTheEmptyDomain   for KIND_DOMAIN: whether the domain part may be empty
// rTheInfix         for KIND_DOMAIN: literal text following the domain part
// bTheTranslation   whether this is a translation ("...->...\1") pattern
// rTheReversePrefix for translations: literal text emitted before "\1"
inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix,
                      bool bTheEmptyDomain, OUString const & rTheInfix,
                      bool bTheTranslation,
                      OUString const & rTheReversePrefix):
    m_eKind(eTheKind),
    m_aPrefix(rThePrefix),
    m_aInfix(rTheInfix),
    m_aReversePrefix(rTheReversePrefix),
    m_bEmptyDomain(bTheEmptyDomain),
    m_bTranslation(bTheTranslation)
{
    // The empty-domain flag and the infix are only meaningful for
    // KIND_DOMAIN; every other kind must leave them false/empty.
    OSL_ASSERT(m_eKind == KIND_DOMAIN
               || (!m_bEmptyDomain && m_aInfix.isEmpty()));
    // A reverse prefix may only be supplied for a translation.
    OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
}
52 
53 
54 namespace {
55 
56 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
57                            sal_Unicode const * pEnd,
58                            OUString const & rString)
59 {
60     sal_Unicode const * p = *pBegin;
61 
62     sal_Unicode const * q = rString.getStr();
63     sal_Unicode const * qEnd = q + rString.getLength();
64 
65     if (pEnd - p < qEnd - q)
66         return false;
67 
68     while (q != qEnd)
69     {
70         if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
71             return false;
72     }
73 
74     *pBegin = p;
75     return true;
76 }
77 
78 }
79 
80 bool Regexp::matches(OUString const & rString) const
81 {
82     sal_Unicode const * pBegin = rString.getStr();
83     sal_Unicode const * pEnd = pBegin + rString.getLength();
84 
85     bool bMatches = false;
86 
87     sal_Unicode const * p = pBegin;
88     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
89     {
90         switch (m_eKind)
91         {
92             case KIND_PREFIX:
93                 bMatches = true;
94                 break;
95 
96             case KIND_AUTHORITY:
97                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
98                 break;
99 
100             case KIND_DOMAIN:
101                 if (!m_bEmptyDomain)
102                 {
103                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
104                         break;
105                     ++p;
106                 }
107                 for (;;)
108                 {
109                     sal_Unicode const * q = p;
110                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
111                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
112                     {
113                         bMatches = true;
114                         break;
115                     }
116 
117                     if (p == pEnd)
118                         break;
119 
120                     sal_Unicode c = *p++;
121                     if (c == '/' || c == '?' || c == '#')
122                         break;
123                 }
124                 break;
125         }
126     }
127 
128     return bMatches;
129 }
130 
131 
132 namespace {
133 
134 bool isScheme(OUString const & rString, bool bColon)
135 {
136     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
137     // is true) from RFC 2396:
138     sal_Unicode const * p = rString.getStr();
139     sal_Unicode const * pEnd = p + rString.getLength();
140     if (p != pEnd && rtl::isAsciiAlpha(*p))
141         for (++p;;)
142         {
143             if (p == pEnd)
144                 return !bColon;
145             sal_Unicode c = *p++;
146             if (!(rtl::isAsciiAlphanumeric(c)
147                   || c == '+' || c == '-' || c == '.'))
148                 return bColon && c == ':' && p == pEnd;
149         }
150     return false;
151 }
152 
153 void appendStringLiteral(OUStringBuffer * pBuffer,
154                          OUString const & rString)
155 {
156     OSL_ASSERT(pBuffer);
157 
158     pBuffer->append('"');
159     sal_Unicode const * p = rString.getStr();
160     sal_Unicode const * pEnd = p + rString.getLength();
161     while (p != pEnd)
162     {
163         sal_Unicode c = *p++;
164         if (c == '"' || c == '\\')
165             pBuffer->append('\\');
166         pBuffer->append(c);
167     }
168     pBuffer->append('"');
169 }
170 
171 }
172 
173 OUString Regexp::getRegexp() const
174 {
175     if (m_bTranslation)
176     {
177         OUStringBuffer aBuffer;
178         if (!m_aPrefix.isEmpty())
179             appendStringLiteral(&aBuffer, m_aPrefix);
180         switch (m_eKind)
181         {
182             case KIND_PREFIX:
183                 aBuffer.append("(.*)");
184                 break;
185 
186             case KIND_AUTHORITY:
187                 aBuffer.append("(([/?#].*)?)");
188                 break;
189 
190             case KIND_DOMAIN:
191                 aBuffer.append("([^/?#]");
192                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
193                 if (!m_aInfix.isEmpty())
194                     appendStringLiteral(&aBuffer, m_aInfix);
195                 aBuffer.append("([/?#].*)?)");
196                 break;
197         }
198         aBuffer.append("->");
199         if (!m_aReversePrefix.isEmpty())
200             appendStringLiteral(&aBuffer, m_aReversePrefix);
201         aBuffer.append("\\1");
202         return aBuffer.makeStringAndClear();
203     }
204     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
205         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
206     else
207     {
208         OUStringBuffer aBuffer;
209         if (!m_aPrefix.isEmpty())
210             appendStringLiteral(&aBuffer, m_aPrefix);
211         switch (m_eKind)
212         {
213             case KIND_PREFIX:
214                 aBuffer.append(".*");
215                 break;
216 
217             case KIND_AUTHORITY:
218                 aBuffer.append("([/?#].*)?");
219                 break;
220 
221             case KIND_DOMAIN:
222                 aBuffer.append("[^/?#]");
223                 aBuffer.append( m_bEmptyDomain ? '*' : '+' );
224                 if (!m_aInfix.isEmpty())
225                     appendStringLiteral(&aBuffer, m_aInfix);
226                 aBuffer.append("([/?#].*)?");
227                 break;
228         }
229         return aBuffer.makeStringAndClear();
230     }
231 }
232 
233 
234 namespace {
235 
236 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
237                  sal_Char const * pString, size_t nStringLength)
238 {
239     sal_Unicode const * p = *pBegin;
240 
241     unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
242     unsigned char const * qEnd = q + nStringLength;
243 
244     if (pEnd - p < qEnd - q)
245         return false;
246 
247     while (q != qEnd)
248     {
249         sal_Unicode c1 = *p++;
250         sal_Unicode c2 = *q++;
251         if (c1 != c2)
252             return false;
253     }
254 
255     *pBegin = p;
256     return true;
257 }
258 
259 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
260                        OUString * pString)
261 {
262     sal_Unicode const * p = *pBegin;
263 
264     if (p == pEnd || *p++ != '"')
265         return false;
266 
267     OUStringBuffer aBuffer;
268     for (;;)
269     {
270         if (p == pEnd)
271             return false;
272         sal_Unicode c = *p++;
273         if (c == '"')
274             break;
275         if (c == '\\')
276         {
277             if (p == pEnd)
278                 return false;
279             c = *p++;
280             if (c != '"' && c != '\\')
281                 return false;
282         }
283         aBuffer.append(c);
284     }
285 
286     *pBegin = p;
287     *pString = aBuffer.makeStringAndClear();
288     return true;
289 }
290 
291 }
292 
293 Regexp Regexp::parse(OUString const & rRegexp)
294 {
295     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
296     // where <scheme> is as defined in RFC 2396:
297     if (isScheme(rRegexp, false))
298         return Regexp(Regexp::KIND_PREFIX,
299                       rRegexp + ":",
300                       false,
301                       OUString(),
302                       false,
303                       OUString());
304 
305     sal_Unicode const * p = rRegexp.getStr();
306     sal_Unicode const * pEnd = p + rRegexp.getLength();
307 
308     OUString aPrefix;
309     scanStringLiteral(&p, pEnd, &aPrefix);
310 
311     if (p == pEnd)
312         throw lang::IllegalArgumentException();
313 
314     // This and the matchString() calls below are some of the few places where
315     // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
316     // (c.f. https://gerrit.libreoffice.org/3117)
317     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
318     {
319         if (p != pEnd)
320             throw lang::IllegalArgumentException();
321 
322         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
323                       false, OUString());
324     }
325     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
326     {
327         OUString aReversePrefix;
328         scanStringLiteral(&p, pEnd, &aReversePrefix);
329 
330         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
331             || p != pEnd)
332             throw lang::IllegalArgumentException();
333 
334         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
335                       true, aReversePrefix);
336     }
337     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
338     {
339         if (p != pEnd)
340             throw lang::IllegalArgumentException();
341 
342         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
343                       false, OUString());
344     }
345     else if (matchString(&p, pEnd,
346                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
347     {
348         OUString aReversePrefix;
349         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
350               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
351               && p == pEnd))
352             throw lang::IllegalArgumentException();
353 
354         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
355                       true, aReversePrefix);
356     }
357     else
358     {
359         bool bOpen = false;
360         if (p != pEnd && *p == '(')
361         {
362             ++p;
363             bOpen = true;
364         }
365 
366         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
367             throw lang::IllegalArgumentException();
368 
369         if (p == pEnd || (*p != '*' && *p != '+'))
370             throw lang::IllegalArgumentException();
371         bool bEmptyDomain = *p++ == '*';
372 
373         OUString aInfix;
374         scanStringLiteral(&p, pEnd, &aInfix);
375 
376         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
377             throw lang::IllegalArgumentException();
378 
379         OUString aReversePrefix;
380         if (bOpen
381             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
382                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
383                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
384             throw lang::IllegalArgumentException();
385 
386         if (p != pEnd)
387             throw lang::IllegalArgumentException();
388 
389         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
390                       bOpen, aReversePrefix);
391     }
392 }
393 
394 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
395