1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This file is part of the LibreOffice project. 4 * 5 * This Source Code Form is subject to the terms of the Mozilla Public 6 * License, v. 2.0. If a copy of the MPL was not distributed with this 7 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 * 9 * This file incorporates work covered by the following license notice: 10 * 11 * Licensed to the Apache Software Foundation (ASF) under one or more 12 * contributor license agreements. See the NOTICE file distributed 13 * with this work for additional information regarding copyright 14 * ownership. The ASF licenses this file to you under the Apache 15 * License, Version 2.0 (the "License"); you may not use this file 16 * except in compliance with the License. You may obtain a copy of 17 * the License at http://www.apache.org/licenses/LICENSE-2.0 . 18 */ 19 20 #include <regexp.hxx> 21 22 #include <cstddef> 23 24 #include <osl/diagnose.h> 25 #include <com/sun/star/lang/IllegalArgumentException.hpp> 26 #include <rtl/character.hxx> 27 #include <rtl/ustrbuf.hxx> 28 #include <rtl/ustring.hxx> 29 30 using namespace com::sun::star; 31 using namespace ucb_impl; 32 33 34 // Regexp 35 36 37 inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix, 38 bool bTheEmptyDomain, OUString const & rTheInfix, 39 bool bTheTranslation, 40 OUString const & rTheReversePrefix): 41 m_eKind(eTheKind), 42 m_aPrefix(rThePrefix), 43 m_aInfix(rTheInfix), 44 m_aReversePrefix(rTheReversePrefix), 45 m_bEmptyDomain(bTheEmptyDomain), 46 m_bTranslation(bTheTranslation) 47 { 48 OSL_ASSERT(m_eKind == KIND_DOMAIN 49 || (!m_bEmptyDomain && m_aInfix.isEmpty())); 50 OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty()); 51 } 52 53 54 namespace { 55 56 bool matchStringIgnoreCase(sal_Unicode const ** pBegin, 57 sal_Unicode const * pEnd, 58 OUString const & rString) 59 { 60 sal_Unicode const * p = *pBegin; 61 62 sal_Unicode const * q = rString.getStr(); 63 sal_Unicode const * qEnd = q + rString.getLength(); 64 65 if (pEnd - p < qEnd - q) 66 return false; 67 68 while (q != qEnd) 69 { 70 if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0) 71 return false; 72 } 73 74 *pBegin = p; 75 return true; 76 } 77 78 } 79 80 bool Regexp::matches(OUString const & rString) const 81 { 82 sal_Unicode const * pBegin = rString.getStr(); 83 sal_Unicode const * pEnd = pBegin + rString.getLength(); 84 85 bool bMatches = false; 86 87 sal_Unicode const * p = pBegin; 88 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix)) 89 { 90 switch (m_eKind) 91 { 92 case KIND_PREFIX: 93 bMatches = true; 94 break; 95 96 case KIND_AUTHORITY: 97 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#'; 98 break; 99 100 case KIND_DOMAIN: 101 if (!m_bEmptyDomain) 102 { 103 if (p == pEnd || *p == '/' || *p == '?' || *p == '#') 104 break; 105 ++p; 106 } 107 for (;;) 108 { 109 sal_Unicode const * q = p; 110 if (matchStringIgnoreCase(&q, pEnd, m_aInfix) 111 && (q == pEnd || *q == '/' || *q == '?' || *q == '#')) 112 { 113 bMatches = true; 114 break; 115 } 116 117 if (p == pEnd) 118 break; 119 120 sal_Unicode c = *p++; 121 if (c == '/' || c == '?' || c == '#') 122 break; 123 } 124 break; 125 } 126 } 127 128 return bMatches; 129 } 130 131 132 namespace { 133 134 bool isScheme(OUString const & rString, bool bColon) 135 { 136 // Return true if rString matches <scheme> (plus a trailing ":" if bColon 137 // is true) from RFC 2396: 138 sal_Unicode const * p = rString.getStr(); 139 sal_Unicode const * pEnd = p + rString.getLength(); 140 if (p != pEnd && rtl::isAsciiAlpha(*p)) 141 for (++p;;) 142 { 143 if (p == pEnd) 144 return !bColon; 145 sal_Unicode c = *p++; 146 if (!(rtl::isAsciiAlphanumeric(c) 147 || c == '+' || c == '-' || c == '.')) 148 return bColon && c == ':' && p == pEnd; 149 } 150 return false; 151 } 152 153 void appendStringLiteral(OUStringBuffer * pBuffer, 154 OUString const & rString) 155 { 156 OSL_ASSERT(pBuffer); 157 158 pBuffer->append('"'); 159 sal_Unicode const * p = rString.getStr(); 160 sal_Unicode const * pEnd = p + rString.getLength(); 161 while (p != pEnd) 162 { 163 sal_Unicode c = *p++; 164 if (c == '"' || c == '\\') 165 pBuffer->append('\\'); 166 pBuffer->append(c); 167 } 168 pBuffer->append('"'); 169 } 170 171 } 172 173 OUString Regexp::getRegexp() const 174 { 175 if (m_bTranslation) 176 { 177 OUStringBuffer aBuffer; 178 if (!m_aPrefix.isEmpty()) 179 appendStringLiteral(&aBuffer, m_aPrefix); 180 switch (m_eKind) 181 { 182 case KIND_PREFIX: 183 aBuffer.append("(.*)"); 184 break; 185 186 case KIND_AUTHORITY: 187 aBuffer.append("(([/?#].*)?)"); 188 break; 189 190 case KIND_DOMAIN: 191 aBuffer.append("([^/?#]"); 192 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+')); 193 if (!m_aInfix.isEmpty()) 194 appendStringLiteral(&aBuffer, m_aInfix); 195 aBuffer.append("([/?#].*)?)"); 196 break; 197 } 198 aBuffer.append("->"); 199 if (!m_aReversePrefix.isEmpty()) 200 appendStringLiteral(&aBuffer, m_aReversePrefix); 201 aBuffer.append("\\1"); 202 return aBuffer.makeStringAndClear(); 203 } 204 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true)) 205 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1); 206 else 207 { 208 OUStringBuffer aBuffer; 209 if (!m_aPrefix.isEmpty()) 210 appendStringLiteral(&aBuffer, m_aPrefix); 211 switch (m_eKind) 212 { 213 case KIND_PREFIX: 214 aBuffer.append(".*"); 215 break; 216 217 case KIND_AUTHORITY: 218 aBuffer.append("([/?#].*)?"); 219 break; 220 221 case KIND_DOMAIN: 222 aBuffer.append("[^/?#]"); 223 aBuffer.append( m_bEmptyDomain ? '*' : '+' ); 224 if (!m_aInfix.isEmpty()) 225 appendStringLiteral(&aBuffer, m_aInfix); 226 aBuffer.append("([/?#].*)?"); 227 break; 228 } 229 return aBuffer.makeStringAndClear(); 230 } 231 } 232 233 234 namespace { 235 236 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, 237 sal_Char const * pString, size_t nStringLength) 238 { 239 sal_Unicode const * p = *pBegin; 240 241 unsigned char const * q = reinterpret_cast< unsigned char const * >(pString); 242 unsigned char const * qEnd = q + nStringLength; 243 244 if (pEnd - p < qEnd - q) 245 return false; 246 247 while (q != qEnd) 248 { 249 sal_Unicode c1 = *p++; 250 sal_Unicode c2 = *q++; 251 if (c1 != c2) 252 return false; 253 } 254 255 *pBegin = p; 256 return true; 257 } 258 259 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, 260 OUString * pString) 261 { 262 sal_Unicode const * p = *pBegin; 263 264 if (p == pEnd || *p++ != '"') 265 return false; 266 267 OUStringBuffer aBuffer; 268 for (;;) 269 { 270 if (p == pEnd) 271 return false; 272 sal_Unicode c = *p++; 273 if (c == '"') 274 break; 275 if (c == '\\') 276 { 277 if (p == pEnd) 278 return false; 279 c = *p++; 280 if (c != '"' && c != '\\') 281 return false; 282 } 283 aBuffer.append(c); 284 } 285 286 *pBegin = p; 287 *pString = aBuffer.makeStringAndClear(); 288 return true; 289 } 290 291 } 292 293 Regexp Regexp::parse(OUString const & rRegexp) 294 { 295 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*' 296 // where <scheme> is as defined in RFC 2396: 297 if (isScheme(rRegexp, false)) 298 return Regexp(Regexp::KIND_PREFIX, 299 rRegexp + ":", 300 false, 301 OUString(), 302 false, 303 OUString()); 304 305 sal_Unicode const * p = rRegexp.getStr(); 306 sal_Unicode const * pEnd = p + rRegexp.getLength(); 307 308 OUString aPrefix; 309 scanStringLiteral(&p, pEnd, &aPrefix); 310 311 if (p == pEnd) 312 throw lang::IllegalArgumentException(); 313 314 // This and the matchString() calls below are some of the few places where 315 // RTL_CONSTASCII_STRINGPARAM() should NOT be removed. 316 // (c.f. https://gerrit.libreoffice.org/3117) 317 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*"))) 318 { 319 if (p != pEnd) 320 throw lang::IllegalArgumentException(); 321 322 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(), 323 false, OUString()); 324 } 325 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->"))) 326 { 327 OUString aReversePrefix; 328 scanStringLiteral(&p, pEnd, &aReversePrefix); 329 330 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) 331 || p != pEnd) 332 throw lang::IllegalArgumentException(); 333 334 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(), 335 true, aReversePrefix); 336 } 337 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) 338 { 339 if (p != pEnd) 340 throw lang::IllegalArgumentException(); 341 342 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(), 343 false, OUString()); 344 } 345 else if (matchString(&p, pEnd, 346 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->"))) 347 { 348 OUString aReversePrefix; 349 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix) 350 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) 351 && p == pEnd)) 352 throw lang::IllegalArgumentException(); 353 354 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(), 355 true, aReversePrefix); 356 } 357 else 358 { 359 bool bOpen = false; 360 if (p != pEnd && *p == '(') 361 { 362 ++p; 363 bOpen = true; 364 } 365 366 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]"))) 367 throw lang::IllegalArgumentException(); 368 369 if (p == pEnd || (*p != '*' && *p != '+')) 370 throw lang::IllegalArgumentException(); 371 bool bEmptyDomain = *p++ == '*'; 372 373 OUString aInfix; 374 scanStringLiteral(&p, pEnd, &aInfix); 375 376 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) 377 throw lang::IllegalArgumentException(); 378 379 OUString aReversePrefix; 380 if (bOpen 381 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->")) 382 && scanStringLiteral(&p, pEnd, &aReversePrefix) 383 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")))) 384 throw lang::IllegalArgumentException(); 385 386 if (p != pEnd) 387 throw lang::IllegalArgumentException(); 388 389 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix, 390 bOpen, aReversePrefix); 391 } 392 } 393 394 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 395
