1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* 3 * This file is part of the LibreOffice project. 4 * 5 * This Source Code Form is subject to the terms of the Mozilla Public 6 * License, v. 2.0. If a copy of the MPL was not distributed with this 7 * file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 * 9 * This file incorporates work covered by the following license notice: 10 * 11 * Licensed to the Apache Software Foundation (ASF) under one or more 12 * contributor license agreements. See the NOTICE file distributed 13 * with this work for additional information regarding copyright 14 * ownership. The ASF licenses this file to you under the Apache 15 * License, Version 2.0 (the "License"); you may not use this file 16 * except in compliance with the License. You may obtain a copy of 17 * the License at http://www.apache.org/licenses/LICENSE-2.0 . 18 */ 19 20 #include <string.h> 21 22 #include <cassert> 23 #include <set> 24 #include <stack> 25 26 #include <com/sun/star/io/IOException.hpp> 27 #include <com/sun/star/lang/WrappedTargetRuntimeException.hpp> 28 #include <com/sun/star/lang/XServiceInfo.hpp> 29 #include <com/sun/star/uno/XComponentContext.hpp> 30 #include <com/sun/star/util/XCloneable.hpp> 31 #include <com/sun/star/xml/sax/XParser.hpp> 32 #include <com/sun/star/xml/sax/SAXParseException.hpp> 33 #include <com/sun/star/xml/sax/SAXInvalidCharacterException.hpp> 34 #include <com/sun/star/xml/sax/XWriter.hpp> 35 36 #include <com/sun/star/io/XActiveDataSource.hpp> 37 38 #include <cppuhelper/weak.hxx> 39 #include <cppuhelper/implbase.hxx> 40 #include <cppuhelper/supportsservice.hxx> 41 42 #include <osl/diagnose.h> 43 #include <rtl/character.hxx> 44 #include <rtl/ref.hxx> 45 #include <rtl/ustrbuf.hxx> 46 47 using namespace ::std; 48 using namespace ::osl; 49 using namespace ::cppu; 50 using namespace ::com::sun::star::uno; 51 using namespace ::com::sun::star::lang; 52 using namespace ::com::sun::star::xml::sax; 53 using namespace ::com::sun::star::util; 54 using namespace ::com::sun::star::io; 55 56 #include <xml2utf.hxx> 57 #include <memory> 58 59 #define LINEFEED 10 60 #define SEQUENCESIZE 1024 61 #define MAXCOLUMNCOUNT 72 62 63 /****** 64 * 65 * 66 * Character conversion functions 67 * 68 * 69 *****/ 70 71 namespace { 72 73 enum SaxInvalidCharacterError 74 { 75 SAX_NONE, 76 SAX_WARNING, 77 SAX_ERROR 78 }; 79 80 class SaxWriterHelper 81 { 82 #ifdef DBG_UTIL 83 public: 84 ::std::stack<OUString> m_DebugStartedElements; 85 #endif 86 87 private: 88 Reference< XOutputStream > m_out; 89 Sequence < sal_Int8 > m_Sequence; 90 sal_Int8* mp_Sequence; 91 92 sal_Int32 nLastLineFeedPos; // is negative after writing a sequence 93 sal_uInt32 nCurrentPos; 94 bool m_bStartElementFinished; 95 96 /// @throws SAXException 97 inline sal_uInt32 writeSequence(); 98 99 // use only if to insert the bytes more space in the sequence is needed and 100 // so the sequence has to write out and reset rPos to 0 101 // writes sequence only on overflow, sequence could be full on the end (rPos == SEQUENCESIZE) 102 /// @throws SAXException 103 inline void AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, 104 const sal_Int8* pBytes, sal_uInt32 nBytesCount); 105 /// @throws SAXException 106 inline bool convertToXML(const sal_Unicode * pStr, 107 sal_Int32 nStrLen, 108 bool bDoNormalization, 109 bool bNormalizeWhitespace, 110 sal_Int8 *pTarget, 111 sal_uInt32& rPos); 112 /// @throws SAXException 113 inline void FinishStartElement(); 114 public: 115 explicit SaxWriterHelper(Reference< XOutputStream > const & m_TempOut) 116 : m_out(m_TempOut) 117 , m_Sequence(SEQUENCESIZE) 118 , mp_Sequence(nullptr) 119 , nLastLineFeedPos(0) 120 , nCurrentPos(0) 121 , m_bStartElementFinished(true) 122 { 123 OSL_ENSURE(SEQUENCESIZE > 50, "Sequence cache size to small"); 124 mp_Sequence = m_Sequence.getArray(); 125 } 126 ~SaxWriterHelper() 127 { 128 OSL_ENSURE(!nCurrentPos, "cached Sequence not written"); 129 OSL_ENSURE(m_bStartElementFinished, "StartElement not completely written"); 130 } 131 132 /// @throws SAXException 133 inline void insertIndentation(sal_uInt32 m_nLevel); 134 135 // returns whether it works correct or invalid characters were in the string 136 // If there are invalid characters in the string it returns sal_False. 137 // Than the calling method has to throw the needed Exception. 138 /// @throws SAXException 139 inline bool writeString(const OUString& rWriteOutString, 140 bool bDoNormalization, 141 bool bNormalizeWhitespace); 142 143 sal_uInt32 GetLastColumnCount() const throw() 144 { return static_cast<sal_uInt32>(nCurrentPos - nLastLineFeedPos); } 145 146 /// @throws SAXException 147 inline void startDocument(); 148 149 // returns whether it works correct or invalid characters were in the strings 150 // If there are invalid characters in one of the strings it returns sal_False. 151 // Than the calling method has to throw the needed Exception. 152 /// @throws SAXException 153 inline SaxInvalidCharacterError startElement(const OUString& rName, const Reference< XAttributeList >& xAttribs); 154 /// @throws SAXException 155 inline bool FinishEmptyElement(); 156 157 // returns whether it works correct or invalid characters were in the string 158 // If there are invalid characters in the string it returns sal_False. 159 // Than the calling method has to throw the needed Exception. 160 /// @throws SAXException 161 inline bool endElement(const OUString& rName); 162 /// @throws SAXException 163 inline void endDocument(); 164 165 // returns whether it works correct or invalid characters were in the strings 166 // If there are invalid characters in the string it returns sal_False. 167 // Than the calling method has to throw the needed Exception. 168 /// @throws SAXException 169 inline bool processingInstruction(const OUString& rTarget, const OUString& rData); 170 /// @throws SAXException 171 inline void startCDATA(); 172 /// @throws SAXException 173 inline void endCDATA(); 174 175 // returns whether it works correct or invalid characters were in the strings 176 // If there are invalid characters in the string it returns sal_False. 177 // Than the calling method has to throw the needed Exception. 178 /// @throws SAXException 179 inline bool comment(const OUString& rComment); 180 181 /// @throws SAXException 182 inline void clearBuffer(); 183 }; 184 185 const bool g_bValidCharsBelow32[32] = 186 { 187 // 0 1 2 3 4 5 6 7 188 false,false,false,false,false,false,false,false, //0 189 false,true, true, false,false,true, false,false, //8 190 false,false,false,false,false,false,false,false, //16 191 false,false,false,false,false,false,false,false 192 }; 193 194 inline bool IsInvalidChar(const sal_Unicode aChar) 195 { 196 bool bRet(false); 197 // check first for the most common characters 198 if( aChar < 32 || aChar >= 0xd800 ) 199 bRet = ( (aChar < 32 && ! g_bValidCharsBelow32[aChar]) || 200 aChar == 0xffff || 201 aChar == 0xfffe ); 202 return bRet; 203 } 204 205 /******** 206 * write through to the output stream 207 * 208 *****/ 209 inline sal_uInt32 SaxWriterHelper::writeSequence() 210 { 211 try 212 { 213 m_out->writeBytes( m_Sequence ); 214 } 215 catch (const IOException & e) 216 { 217 Any a; 218 a <<= e; 219 throw SAXException( 220 "IO exception during writing", 221 Reference< XInterface > (), 222 a ); 223 } 224 nLastLineFeedPos -= SEQUENCESIZE; 225 return 0; 226 } 227 228 inline void SaxWriterHelper::AddBytes(sal_Int8* pTarget, sal_uInt32& rPos, 229 const sal_Int8* pBytes, sal_uInt32 nBytesCount) 230 { 231 OSL_ENSURE((rPos + nBytesCount) > SEQUENCESIZE, "wrong use of AddBytesMethod"); 232 sal_uInt32 nCount(SEQUENCESIZE - rPos); 233 memcpy( &(pTarget[rPos]) , pBytes, nCount); 234 235 OSL_ENSURE(rPos + nCount == SEQUENCESIZE, "the position should be the at the end"); 236 237 rPos = writeSequence(); 238 sal_uInt32 nRestCount(nBytesCount - nCount); 239 if ((rPos + nRestCount) <= SEQUENCESIZE) 240 { 241 memcpy( &(pTarget[rPos]), &pBytes[nCount], nRestCount); 242 rPos += nRestCount; 243 } 244 else 245 AddBytes(pTarget, rPos, &pBytes[nCount], nRestCount); 246 } 247 248 /** Converts an UTF16 string to UTF8 and does XML normalization 249 250 @param pTarget 251 Pointer to a piece of memory, to where the output should be written. The caller 252 must call calcXMLByteLength on the same string, to ensure, 253 that there is enough memory for converting. 254 */ 255 inline bool SaxWriterHelper::convertToXML( const sal_Unicode * pStr, 256 sal_Int32 nStrLen, 257 bool bDoNormalization, 258 bool bNormalizeWhitespace, 259 sal_Int8 *pTarget, 260 sal_uInt32& rPos ) 261 { 262 bool bRet(true); 263 sal_uInt32 nSurrogate = 0; 264 265 for( sal_Int32 i = 0 ; i < nStrLen ; i ++ ) 266 { 267 sal_uInt16 c = pStr[i]; 268 if (IsInvalidChar(c)) 269 bRet = false; 270 else if( (c >= 0x0001) && (c <= 0x007F) ) 271 { 272 if( bDoNormalization ) 273 { 274 switch( c ) 275 { 276 case '&': // resemble to & 277 { 278 if ((rPos + 5) > SEQUENCESIZE) 279 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("&"), 5); 280 else 281 { 282 memcpy( &(pTarget[rPos]) , "&", 5 ); 283 rPos += 5; 284 } 285 } 286 break; 287 case '<': 288 { 289 if ((rPos + 4) > SEQUENCESIZE) 290 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("<"), 4); 291 else 292 { 293 memcpy( &(pTarget[rPos]) , "<" , 4 ); 294 rPos += 4; // < 295 } 296 } 297 break; 298 case '>': 299 { 300 if ((rPos + 4) > SEQUENCESIZE) 301 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>(">"), 4); 302 else 303 { 304 memcpy( &(pTarget[rPos]) , ">" , 4 ); 305 rPos += 4; // > 306 } 307 } 308 break; 309 case 39: // 39 == ''' 310 { 311 if ((rPos + 6) > SEQUENCESIZE) 312 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("'"), 6); 313 else 314 { 315 memcpy( &(pTarget[rPos]) , "'" , 6 ); 316 rPos += 6; // ' 317 } 318 } 319 break; 320 case '"': 321 { 322 if ((rPos + 6) > SEQUENCESIZE) 323 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("""), 6); 324 else 325 { 326 memcpy( &(pTarget[rPos]) , """ , 6 ); 327 rPos += 6; // " 328 } 329 } 330 break; 331 case 13: 332 { 333 if ((rPos + 6) > SEQUENCESIZE) 334 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("
"), 6); 335 else 336 { 337 memcpy( &(pTarget[rPos]) , "
" , 6 ); 338 rPos += 6; 339 } 340 } 341 break; 342 case LINEFEED: 343 { 344 if( bNormalizeWhitespace ) 345 { 346 if ((rPos + 6) > SEQUENCESIZE) 347 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("
"), 6); 348 else 349 { 350 memcpy( &(pTarget[rPos]) , "
" , 6 ); 351 rPos += 6; 352 } 353 } 354 else 355 { 356 pTarget[rPos] = LINEFEED; 357 nLastLineFeedPos = rPos; 358 rPos ++; 359 } 360 } 361 break; 362 case 9: 363 { 364 if( bNormalizeWhitespace ) 365 { 366 if ((rPos + 6) > SEQUENCESIZE) 367 AddBytes(pTarget, rPos, reinterpret_cast<sal_Int8 const *>("	"), 6); 368 else 369 { 370 memcpy( &(pTarget[rPos]) , "	" , 6 ); 371 rPos += 6; 372 } 373 } 374 else 375 { 376 pTarget[rPos] = 9; 377 rPos ++; 378 } 379 } 380 break; 381 default: 382 { 383 pTarget[rPos] = static_cast<sal_Int8>(c); 384 rPos ++; 385 } 386 break; 387 } 388 } 389 else 390 { 391 pTarget[rPos] = static_cast<sal_Int8>(c); 392 if (static_cast<sal_Int8>(c) == LINEFEED) 393 nLastLineFeedPos = rPos; 394 rPos ++; 395 } 396 } 397 else if( c >= 0xd800 && c < 0xdc00 ) 398 { 399 // 1. surrogate: save (until 2. surrogate) 400 OSL_ENSURE( nSurrogate == 0, "left-over Unicode surrogate" ); 401 nSurrogate = ( ( c & 0x03ff ) + 0x0040 ); 402 } 403 else if( c >= 0xdc00 && c < 0xe000 ) 404 { 405 // 2. surrogate: write as UTF-8 406 OSL_ENSURE( nSurrogate != 0, "lone 2nd Unicode surrogate" ); 407 408 nSurrogate = ( nSurrogate << 10 ) | ( c & 0x03ff ); 409 if( rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000 ) 410 { 411 sal_Int8 aBytes[] = { sal_Int8(0xF0 | ((nSurrogate >> 18) & 0x0F)), 412 sal_Int8(0x80 | ((nSurrogate >> 12) & 0x3F)), 413 sal_Int8(0x80 | ((nSurrogate >> 6) & 0x3F)), 414 sal_Int8(0x80 | ((nSurrogate >> 0) & 0x3F)) }; 415 if ((rPos + 4) > SEQUENCESIZE) 416 AddBytes(pTarget, rPos, aBytes, 4); 417 else 418 { 419 pTarget[rPos] = aBytes[0]; 420 rPos ++; 421 pTarget[rPos] = aBytes[1]; 422 rPos ++; 423 pTarget[rPos] = aBytes[2]; 424 rPos ++; 425 pTarget[rPos] = aBytes[3]; 426 rPos ++; 427 } 428 } 429 else 430 { 431 OSL_FAIL( "illegal Unicode character" ); 432 bRet = false; 433 } 434 435 // reset surrogate 436 nSurrogate = 0; 437 } 438 else if( c > 0x07FF ) 439 { 440 sal_Int8 aBytes[] = { sal_Int8(0xE0 | ((c >> 12) & 0x0F)), 441 sal_Int8(0x80 | ((c >> 6) & 0x3F)), 442 sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; 443 if ((rPos + 3) > SEQUENCESIZE) 444 AddBytes(pTarget, rPos, aBytes, 3); 445 else 446 { 447 pTarget[rPos] = aBytes[0]; 448 rPos ++; 449 pTarget[rPos] = aBytes[1]; 450 rPos ++; 451 pTarget[rPos] = aBytes[2]; 452 rPos ++; 453 } 454 } 455 else 456 { 457 sal_Int8 aBytes[] = { sal_Int8(0xC0 | ((c >> 6) & 0x1F)), 458 sal_Int8(0x80 | ((c >> 0) & 0x3F)) }; 459 if ((rPos + 2) > SEQUENCESIZE) 460 AddBytes(pTarget, rPos, aBytes, 2); 461 else 462 { 463 pTarget[rPos] = aBytes[0]; 464 rPos ++; 465 pTarget[rPos] = aBytes[1]; 466 rPos ++; 467 } 468 } 469 OSL_ENSURE(rPos <= SEQUENCESIZE, "not reset current position"); 470 if (rPos == SEQUENCESIZE) 471 rPos = writeSequence(); 472 473 // reset left-over surrogate 474 if( ( nSurrogate != 0 ) && !( c >= 0xd800 && c < 0xdc00 ) ) 475 { 476 OSL_ENSURE( nSurrogate != 0, "left-over Unicode surrogate" ); 477 nSurrogate = 0; 478 bRet = false; 479 } 480 } 481 return bRet; 482 } 483 484 inline void SaxWriterHelper::FinishStartElement() 485 { 486 if (!m_bStartElementFinished) 487 { 488 mp_Sequence[nCurrentPos] = '>'; 489 nCurrentPos++; 490 if (nCurrentPos == SEQUENCESIZE) 491 nCurrentPos = writeSequence(); 492 m_bStartElementFinished = true; 493 } 494 } 495 496 inline void SaxWriterHelper::insertIndentation(sal_uInt32 m_nLevel) 497 { 498 FinishStartElement(); 499 if (m_nLevel > 0) 500 { 501 if ((nCurrentPos + m_nLevel + 1) <= SEQUENCESIZE) 502 { 503 mp_Sequence[nCurrentPos] = LINEFEED; 504 nLastLineFeedPos = nCurrentPos; 505 nCurrentPos++; 506 memset( &(mp_Sequence[nCurrentPos]) , 32 , m_nLevel ); 507 nCurrentPos += m_nLevel; 508 if (nCurrentPos == SEQUENCESIZE) 509 nCurrentPos = writeSequence(); 510 } 511 else 512 { 513 sal_uInt32 nCount(m_nLevel + 1); 514 std::unique_ptr<sal_Int8[]> pBytes(new sal_Int8[nCount]); 515 pBytes[0] = LINEFEED; 516 memset( &(pBytes[1]), 32, m_nLevel ); 517 AddBytes(mp_Sequence, nCurrentPos, pBytes.get(), nCount); 518 pBytes.reset(); 519 nLastLineFeedPos = nCurrentPos - nCount; 520 if (nCurrentPos == SEQUENCESIZE) 521 nCurrentPos = writeSequence(); 522 } 523 } 524 else 525 { 526 mp_Sequence[nCurrentPos] = LINEFEED; 527 nLastLineFeedPos = nCurrentPos; 528 nCurrentPos++; 529 if (nCurrentPos == SEQUENCESIZE) 530 nCurrentPos = writeSequence(); 531 } 532 } 533 534 inline bool SaxWriterHelper::writeString( const OUString& rWriteOutString, 535 bool bDoNormalization, 536 bool bNormalizeWhitespace ) 537 { 538 FinishStartElement(); 539 return convertToXML(rWriteOutString.getStr(), 540 rWriteOutString.getLength(), 541 bDoNormalization, 542 bNormalizeWhitespace, 543 mp_Sequence, 544 nCurrentPos); 545 } 546 547 inline void SaxWriterHelper::startDocument() 548 { 549 const char pc[] = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; 550 const int nLen = strlen( pc ); 551 if ((nCurrentPos + nLen) <= SEQUENCESIZE) 552 { 553 memcpy( mp_Sequence, pc , nLen ); 554 nCurrentPos += nLen; 555 } 556 else 557 { 558 AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const *>(pc), nLen); 559 } 560 OSL_ENSURE(nCurrentPos <= SEQUENCESIZE, "not reset current position"); 561 if (nCurrentPos == SEQUENCESIZE) 562 nCurrentPos = writeSequence(); 563 mp_Sequence[nCurrentPos] = LINEFEED; 564 nCurrentPos++; 565 if (nCurrentPos == SEQUENCESIZE) 566 nCurrentPos = writeSequence(); 567 } 568 569 void CheckValidName(OUString const& rName) 570 { 571 #ifdef NDEBUG 572 (void) rName; 573 #else 574 assert(!rName.isEmpty()); 575 bool hasColon(false); 576 for (sal_Int32 i = 0; i < rName.getLength(); ++i) 577 { 578 auto const c(rName[i]); 579 if (c == ':') 580 { 581 if (hasColon) 582 assert("only one colon allowed"); 583 else 584 hasColon = true; 585 } 586 else if (!rtl::isAsciiAlphanumeric(c) && c != '_' && c != '-' && c != '.') 587 { // note: this will also warn about non-ASCII characters which 588 // are allowed by XML but surely unexpected in LO filters 589 // (OTOH we don't warn about invalid start chars) 590 assert(!"unexpected character in attribute name"); 591 } 592 } 593 #endif 594 } 595 596 inline SaxInvalidCharacterError SaxWriterHelper::startElement(const OUString& rName, const Reference< XAttributeList >& xAttribs) 597 { 598 FinishStartElement(); 599 600 #ifdef DBG_UTIL 601 m_DebugStartedElements.push(rName); 602 ::std::set<OUString> DebugAttributes; 603 #endif 604 605 mp_Sequence[nCurrentPos] = '<'; 606 nCurrentPos++; 607 if (nCurrentPos == SEQUENCESIZE) 608 nCurrentPos = writeSequence(); 609 610 SaxInvalidCharacterError eRet(SAX_NONE); 611 CheckValidName(rName); 612 if (!writeString(rName, false, false)) 613 eRet = SAX_ERROR; 614 615 sal_Int16 nAttribCount = xAttribs.is() ? static_cast<sal_Int16>(xAttribs->getLength()) : 0; 616 for(sal_Int16 i = 0 ; i < nAttribCount ; i++ ) 617 { 618 mp_Sequence[nCurrentPos] = ' '; 619 nCurrentPos++; 620 if (nCurrentPos == SEQUENCESIZE) 621 nCurrentPos = writeSequence(); 622 623 OUString const& rAttrName(xAttribs->getNameByIndex(i)); 624 #ifdef DBG_UTIL 625 // Well-formedness constraint: Unique Att Spec 626 assert(DebugAttributes.find(rAttrName) == DebugAttributes.end()); 627 DebugAttributes.insert(rAttrName); 628 #endif 629 CheckValidName(rAttrName); 630 if (!writeString(rAttrName, false, false)) 631 eRet = SAX_ERROR; 632 633 mp_Sequence[nCurrentPos] = '='; 634 nCurrentPos++; 635 if (nCurrentPos == SEQUENCESIZE) 636 nCurrentPos = writeSequence(); 637 mp_Sequence[nCurrentPos] = '"'; 638 nCurrentPos++; 639 if (nCurrentPos == SEQUENCESIZE) 640 nCurrentPos = writeSequence(); 641 642 if (!writeString(xAttribs->getValueByIndex( i ), true, true) && 643 eRet != SAX_ERROR) 644 eRet = SAX_WARNING; 645 646 mp_Sequence[nCurrentPos] = '"'; 647 nCurrentPos++; 648 if (nCurrentPos == SEQUENCESIZE) 649 nCurrentPos = writeSequence(); 650 } 651 652 m_bStartElementFinished = false; // because the '>' character is not added, 653 // because it is possible, that the "/>" 654 // characters have to add 655 return eRet; 656 } 657 658 inline bool SaxWriterHelper::FinishEmptyElement() 659 { 660 if (m_bStartElementFinished) 661 return false; 662 663 mp_Sequence[nCurrentPos] = '/'; 664 nCurrentPos++; 665 if (nCurrentPos == SEQUENCESIZE) 666 nCurrentPos = writeSequence(); 667 mp_Sequence[nCurrentPos] = '>'; 668 nCurrentPos++; 669 if (nCurrentPos == SEQUENCESIZE) 670 nCurrentPos = writeSequence(); 671 672 m_bStartElementFinished = true; 673 674 return true; 675 } 676 677 inline bool SaxWriterHelper::endElement(const OUString& rName) 678 { 679 FinishStartElement(); 680 681 mp_Sequence[nCurrentPos] = '<'; 682 nCurrentPos++; 683 if (nCurrentPos == SEQUENCESIZE) 684 nCurrentPos = writeSequence(); 685 mp_Sequence[nCurrentPos] = '/'; 686 nCurrentPos++; 687 if (nCurrentPos == SEQUENCESIZE) 688 nCurrentPos = writeSequence(); 689 690 CheckValidName(rName); 691 bool bRet(writeString( rName, false, false)); 692 693 mp_Sequence[nCurrentPos] = '>'; 694 nCurrentPos++; 695 if (nCurrentPos == SEQUENCESIZE) 696 nCurrentPos = writeSequence(); 697 698 return bRet; 699 } 700 701 inline void SaxWriterHelper::endDocument() 702 { 703 if (nCurrentPos > 0) 704 { 705 m_Sequence.realloc(nCurrentPos); 706 nCurrentPos = writeSequence(); 707 //m_Sequence.realloc(SEQUENCESIZE); 708 } 709 } 710 711 inline void SaxWriterHelper::clearBuffer() 712 { 713 FinishStartElement(); 714 if (nCurrentPos > 0) 715 { 716 m_Sequence.realloc(nCurrentPos); 717 nCurrentPos = writeSequence(); 718 m_Sequence.realloc(SEQUENCESIZE); 719 // Be sure to update the array pointer after the reallocation. 720 mp_Sequence = m_Sequence.getArray(); 721 } 722 } 723 724 inline bool SaxWriterHelper::processingInstruction(const OUString& rTarget, const OUString& rData) 725 { 726 FinishStartElement(); 727 mp_Sequence[nCurrentPos] = '<'; 728 nCurrentPos++; 729 if (nCurrentPos == SEQUENCESIZE) 730 nCurrentPos = writeSequence(); 731 mp_Sequence[nCurrentPos] = '?'; 732 nCurrentPos++; 733 if (nCurrentPos == SEQUENCESIZE) 734 nCurrentPos = writeSequence(); 735 736 bool bRet(writeString( rTarget, false, false )); 737 738 mp_Sequence[nCurrentPos] = ' '; 739 nCurrentPos++; 740 if (nCurrentPos == SEQUENCESIZE) 741 nCurrentPos = writeSequence(); 742 743 if (!writeString( rData, false, false )) 744 bRet = false; 745 746 mp_Sequence[nCurrentPos] = '?'; 747 nCurrentPos++; 748 if (nCurrentPos == SEQUENCESIZE) 749 nCurrentPos = writeSequence(); 750 mp_Sequence[nCurrentPos] = '>'; 751 nCurrentPos++; 752 if (nCurrentPos == SEQUENCESIZE) 753 nCurrentPos = writeSequence(); 754 755 return bRet; 756 } 757 758 inline void SaxWriterHelper::startCDATA() 759 { 760 FinishStartElement(); 761 if ((nCurrentPos + 9) <= SEQUENCESIZE) 762 { 763 memcpy( &(mp_Sequence[nCurrentPos]), "<![CDATA[" , 9 ); 764 nCurrentPos += 9; 765 } 766 else 767 AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const *>("<![CDATA["), 9); 768 if (nCurrentPos == SEQUENCESIZE) 769 nCurrentPos = writeSequence(); 770 } 771 772 inline void SaxWriterHelper::endCDATA() 773 { 774 FinishStartElement(); 775 if ((nCurrentPos + 3) <= SEQUENCESIZE) 776 { 777 memcpy( &(mp_Sequence[nCurrentPos]), "]]>" , 3 ); 778 nCurrentPos += 3; 779 } 780 else 781 AddBytes(mp_Sequence, nCurrentPos, reinterpret_cast<sal_Int8 const *>("]]>"), 3); 782 if (nCurrentPos == SEQUENCESIZE) 783 nCurrentPos = writeSequence(); 784 } 785 786 inline bool SaxWriterHelper::comment(const OUString& rComment) 787 { 788 FinishStartElement(); 789 mp_Sequence[nCurrentPos] = '<'; 790 nCurrentPos++; 791 if (nCurrentPos == SEQUENCESIZE) 792 nCurrentPos = writeSequence(); 793 mp_Sequence[nCurrentPos] = '!'; 794 nCurrentPos++; 795 if (nCurrentPos == SEQUENCESIZE) 796 nCurrentPos = writeSequence(); 797 mp_Sequence[nCurrentPos] = '-'; 798 nCurrentPos++; 799 if (nCurrentPos == SEQUENCESIZE) 800 nCurrentPos = writeSequence(); 801 mp_Sequence[nCurrentPos] = '-'; 802 nCurrentPos++; 803 if (nCurrentPos == SEQUENCESIZE) 804 nCurrentPos = writeSequence(); 805 806 bool bRet(writeString( rComment, false, false)); 807 808 mp_Sequence[nCurrentPos] = '-'; 809 nCurrentPos++; 810 if (nCurrentPos == SEQUENCESIZE) 811 nCurrentPos = writeSequence(); 812 mp_Sequence[nCurrentPos] = '-'; 813 nCurrentPos++; 814 if (nCurrentPos == SEQUENCESIZE) 815 nCurrentPos = writeSequence(); 816 mp_Sequence[nCurrentPos] = '>'; 817 nCurrentPos++; 818 if (nCurrentPos == SEQUENCESIZE) 819 nCurrentPos = writeSequence(); 820 821 return bRet; 822 } 823 824 inline sal_Int32 calcXMLByteLength( const OUString& rStr, 825 bool bDoNormalization, 826 bool bNormalizeWhitespace ) 827 { 828 sal_Int32 nOutputLength = 0; 829 sal_uInt32 nSurrogate = 0; 830 831 const sal_Unicode *pStr = rStr.getStr(); 832 sal_Int32 nStrLen = rStr.getLength(); 833 for( sal_Int32 i = 0 ; i < nStrLen ; i++ ) 834 { 835 sal_uInt16 c = pStr[i]; 836 if( !IsInvalidChar(c) && (c >= 0x0001) && (c <= 0x007F) ) 837 { 838 if( bDoNormalization ) 839 { 840 switch( c ) 841 { 842 case '&': // resemble to & 843 nOutputLength +=5; 844 break; 845 case '<': // < 846 case '>': // > 847 nOutputLength +=4; 848 break; 849 case 39: // 39 == ''', ' 850 case '"': // " 851 case 13: // 
 852 nOutputLength += 6; 853 break; 854 855 case 10: // 
 856 case 9: // 	 857 if( bNormalizeWhitespace ) 858 { 859 nOutputLength += 6; 860 } 861 else 862 { 863 nOutputLength ++; 864 } 865 break; 866 default: 867 nOutputLength ++; 868 } 869 } 870 else 871 { 872 nOutputLength ++; 873 } 874 } 875 else if( c >= 0xd800 && c < 0xdc00 ) 876 { 877 // save surrogate 878 nSurrogate = ( ( c & 0x03ff ) + 0x0040 ); 879 } 880 else if( c >= 0xdc00 && c < 0xe000 ) 881 { 882 // 2. surrogate: write as UTF-8 (if range is OK 883 nSurrogate = ( nSurrogate << 10 ) | ( c & 0x03ff ); 884 if( rtl::isUnicodeScalarValue(nSurrogate) && nSurrogate >= 0x00010000 ) 885 nOutputLength += 4; 886 nSurrogate = 0; 887 } 888 else if( c > 0x07FF ) 889 { 890 nOutputLength += 3; 891 } 892 else 893 { 894 nOutputLength += 2; 895 } 896 897 // surrogate processing 898 if( ( nSurrogate != 0 ) && !( c >= 0xd800 && c < 0xdc00 ) ) 899 nSurrogate = 0; 900 } 901 902 return nOutputLength; 903 } 904 905 /** returns position of first ascii 10 within the string, -1 when no 10 in string. 906 */ 907 inline sal_Int32 getFirstLineBreak( const OUString & str ) throw () 908 { 909 const sal_Unicode *pSource = str.getStr(); 910 sal_Int32 nLen = str.getLength(); 911 912 for( int n = 0; n < nLen ; n ++ ) 913 { 914 if( LINEFEED == pSource[n] ) { 915 return n; 916 } 917 } 918 return -1; 919 } 920 921 class SAXWriter : 922 public WeakImplHelper< 923 XWriter, 924 XServiceInfo > 925 { 926 public: 927 SAXWriter() 928 : m_pSaxWriterHelper(nullptr) 929 , m_bDocStarted(false) 930 , m_bIsCDATA(false) 931 , m_bForceLineBreak(false) 932 , m_bAllowLineBreak(false) 933 , m_nLevel(0) 934 { 935 } 936 937 public: // XActiveDataSource 938 virtual void SAL_CALL setOutputStream(const Reference< XOutputStream > & aStream) override 939 { 940 try 941 { 942 // temporary: set same stream again to clear buffer 943 if ( m_out == aStream && m_pSaxWriterHelper && m_bDocStarted ) 944 m_pSaxWriterHelper->clearBuffer(); 945 else 946 { 947 m_out = aStream; 948 m_pSaxWriterHelper.reset( new SaxWriterHelper(m_out) ); 949 m_bDocStarted = false; 950 m_nLevel = 0; 951 m_bIsCDATA = false; 952 } 953 } 954 catch (const SAXException& e) 955 { 956 throw css::lang::WrappedTargetRuntimeException( 957 e.Message, 958 static_cast < OWeakObject * > ( this ), 959 e.WrappedException); 960 } 961 } 962 virtual Reference< XOutputStream > SAL_CALL getOutputStream() override 963 { 964 return m_out; 965 } 966 967 public: // XDocumentHandler 968 virtual void SAL_CALL startDocument() override; 969 970 virtual void SAL_CALL endDocument() override; 971 972 virtual void SAL_CALL startElement(const OUString& aName, 973 const Reference< XAttributeList > & xAttribs) override; 974 975 virtual void SAL_CALL endElement(const OUString& aName) override; 976 977 virtual void SAL_CALL characters(const OUString& aChars) override; 978 979 virtual void SAL_CALL ignorableWhitespace(const OUString& aWhitespaces) override; 980 virtual void SAL_CALL processingInstruction(const OUString& aTarget, 981 const OUString& aData) override; 982 virtual void SAL_CALL setDocumentLocator(const Reference< XLocator > & xLocator) override; 983 984 public: // XExtendedDocumentHandler 985 virtual void SAL_CALL startCDATA() override; 986 virtual void SAL_CALL endCDATA() override; 987 virtual void SAL_CALL comment(const OUString& sComment) override; 988 virtual void SAL_CALL unknown(const OUString& sString) override; 989 virtual void SAL_CALL allowLineBreak() override; 990 991 public: // XServiceInfo 992 OUString SAL_CALL getImplementationName() override; 993 Sequence< OUString > SAL_CALL getSupportedServiceNames() override; 994 sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override; 995 996 private: 997 sal_Int32 getIndentPrefixLength( sal_Int32 nFirstLineBreakOccurrence ) throw(); 998 999 Reference< XOutputStream > m_out; 1000 std::unique_ptr<SaxWriterHelper> m_pSaxWriterHelper; 1001 1002 // Status information 1003 bool m_bDocStarted : 1; 1004 bool m_bIsCDATA : 1; 1005 bool m_bForceLineBreak : 1; 1006 bool m_bAllowLineBreak : 1; 1007 sal_Int32 m_nLevel; 1008 }; 1009 1010 sal_Int32 SAXWriter::getIndentPrefixLength( sal_Int32 nFirstLineBreakOccurrence ) throw() 1011 { 1012 sal_Int32 nLength =-1; 1013 if (m_pSaxWriterHelper) 1014 { 1015 if ( m_bForceLineBreak || 1016 (m_bAllowLineBreak && 1017 ((nFirstLineBreakOccurrence + m_pSaxWriterHelper->GetLastColumnCount()) > MAXCOLUMNCOUNT)) ) 1018 nLength = m_nLevel; 1019 } 1020 m_bForceLineBreak = false; 1021 m_bAllowLineBreak = false; 1022 return nLength; 1023 } 1024 1025 inline bool isFirstCharWhitespace( const sal_Unicode *p ) throw() 1026 { 1027 return *p == ' '; 1028 } 1029 1030 // XServiceInfo 1031 OUString SAXWriter::getImplementationName() 1032 { 1033 return OUString("com.sun.star.extensions.xml.sax.Writer"); 1034 } 1035 1036 // XServiceInfo 1037 sal_Bool SAXWriter::supportsService(const OUString& ServiceName) 1038 { 1039 return cppu::supportsService(this, ServiceName); 1040 } 1041 1042 // XServiceInfo 1043 Sequence< OUString > SAXWriter::getSupportedServiceNames() 1044 { 1045 Sequence<OUString> seq { "com.sun.star.xml.sax.Writer" }; 1046 return seq; 1047 } 1048 1049 void SAXWriter::startDocument() 1050 { 1051 if( m_bDocStarted || ! m_out.is() || !m_pSaxWriterHelper ) { 1052 throw SAXException(); 1053 } 1054 m_bDocStarted = true; 1055 m_pSaxWriterHelper->startDocument(); 1056 } 1057 1058 1059 void SAXWriter::endDocument() 1060 { 1061 if( ! m_bDocStarted ) 1062 { 1063 throw SAXException( 1064 "endDocument called before startDocument", 1065 Reference< XInterface >() , Any() ); 1066 } 1067 if( m_nLevel ) { 1068 throw SAXException( 1069 "unexpected end of document", 1070 Reference< XInterface >() , Any() ); 1071 } 1072 m_pSaxWriterHelper->endDocument(); 1073 try 1074 { 1075 m_out->closeOutput(); 1076 } 1077 catch (const IOException & e) 1078 { 1079 Any a; 1080 a <<= e; 1081 throw SAXException( 1082 "IO exception during closing the IO Stream", 1083 Reference< XInterface > (), 1084 a ); 1085 } 1086 } 1087 1088 1089 void SAXWriter::startElement(const OUString& aName, const Reference< XAttributeList >& xAttribs) 1090 { 1091 if( ! m_bDocStarted ) 1092 { 1093 SAXException except; 1094 except.Message = "startElement called before startDocument"; 1095 throw except; 1096 } 1097 if( m_bIsCDATA ) 1098 { 1099 SAXException except; 1100 except.Message = "startElement call not allowed with CDATA sections"; 1101 throw except; 1102 } 1103 1104 sal_Int32 nLength(0); 1105 if (m_bAllowLineBreak) 1106 { 1107 sal_Int32 nAttribCount = xAttribs.is() ? xAttribs->getLength() : 0; 1108 1109 nLength ++; // "<" 1110 nLength += calcXMLByteLength( aName, false, false ); // the tag name 1111 1112 sal_Int16 n; 1113 for( n = 0 ; n < static_cast<sal_Int16>(nAttribCount) ; n ++ ) { 1114 nLength ++; // " " 1115 OUString tmp = xAttribs->getNameByIndex( n ); 1116 1117 nLength += calcXMLByteLength( tmp, false, false ); 1118 1119 nLength += 2; // =" 1120 1121 tmp = xAttribs->getValueByIndex( n ); 1122 1123 nLength += calcXMLByteLength( tmp, true, true ); 1124 1125 nLength += 1; // " 1126 } 1127 1128 nLength ++; // '>' 1129 } 1130 1131 // Is there a new indentation necessary ? 1132 sal_Int32 nPrefix(getIndentPrefixLength( nLength )); 1133 1134 // write into sequence 1135 if( nPrefix >= 0 ) 1136 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1137 1138 SaxInvalidCharacterError eRet(m_pSaxWriterHelper->startElement(aName, xAttribs)); 1139 1140 m_nLevel++; 1141 1142 if (eRet == SAX_WARNING) 1143 { 1144 SAXInvalidCharacterException except; 1145 except.Message = "Invalid character during XML-Export in a attribute value"; 1146 throw except; 1147 } 1148 else if (eRet == SAX_ERROR) 1149 { 1150 SAXException except; 1151 except.Message = "Invalid character during XML-Export"; 1152 throw except; 1153 } 1154 } 1155 1156 void SAXWriter::endElement(const OUString& aName) 1157 { 1158 if( ! m_bDocStarted ) { 1159 throw SAXException (); 1160 } 1161 m_nLevel --; 1162 1163 if( m_nLevel < 0 ) { 1164 throw SAXException(); 1165 } 1166 bool bRet(true); 1167 1168 // check here because Helper's endElement is not always called 1169 #ifdef DBG_UTIL 1170 assert(!m_pSaxWriterHelper->m_DebugStartedElements.empty()); 1171 // Well-formedness constraint: Element Type Match 1172 assert(aName == m_pSaxWriterHelper->m_DebugStartedElements.top()); 1173 m_pSaxWriterHelper->m_DebugStartedElements.pop(); 1174 #endif 1175 1176 if( m_pSaxWriterHelper->FinishEmptyElement() ) 1177 m_bForceLineBreak = false; 1178 else 1179 { 1180 // only ascii chars allowed 1181 sal_Int32 nLength(0); 1182 if (m_bAllowLineBreak) 1183 nLength = 3 + calcXMLByteLength( aName, false, false ); 1184 sal_Int32 nPrefix = getIndentPrefixLength( nLength ); 1185 1186 if( nPrefix >= 0 ) 1187 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1188 1189 bRet = m_pSaxWriterHelper->endElement(aName); 1190 } 1191 1192 if (!bRet) 1193 { 1194 SAXException except; 1195 except.Message = "Invalid character during XML-Export"; 1196 throw except; 1197 } 1198 } 1199 1200 void SAXWriter::characters(const OUString& aChars) 1201 { 1202 if( ! m_bDocStarted ) 1203 { 1204 SAXException except; 1205 except.Message = "characters method called before startDocument"; 1206 throw except; 1207 } 1208 1209 bool bThrowException(false); 1210 if( !aChars.isEmpty() ) 1211 { 1212 if( m_bIsCDATA ) 1213 bThrowException = !m_pSaxWriterHelper->writeString( aChars, false, false ); 1214 else 1215 { 1216 // Note : nFirstLineBreakOccurrence is not exact, because we don't know, how 1217 // many 2 and 3 byte chars are inbetween. However this whole stuff 1218 // is eitherway for pretty printing only, so it does not need to be exact. 1219 sal_Int32 nLength(0); 1220 sal_Int32 nIndentPrefix(-1); 1221 if (m_bAllowLineBreak) 1222 { 1223 sal_Int32 nFirstLineBreakOccurrence = getFirstLineBreak( aChars ); 1224 1225 nLength = calcXMLByteLength( aChars, ! m_bIsCDATA , false ); 1226 nIndentPrefix = getIndentPrefixLength( 1227 nFirstLineBreakOccurrence >= 0 ? nFirstLineBreakOccurrence : nLength ); 1228 } 1229 else 1230 nIndentPrefix = getIndentPrefixLength(nLength); 1231 1232 // insert indentation 1233 if( nIndentPrefix >= 0 ) 1234 { 1235 if( isFirstCharWhitespace( aChars.getStr() ) ) 1236 m_pSaxWriterHelper->insertIndentation( nIndentPrefix - 1 ); 1237 else 1238 m_pSaxWriterHelper->insertIndentation( nIndentPrefix ); 1239 } 1240 bThrowException = !m_pSaxWriterHelper->writeString(aChars, true , false); 1241 } 1242 } 1243 if (bThrowException) 1244 { 1245 SAXInvalidCharacterException except; 1246 except.Message = "Invalid character during XML-Export"; 1247 throw except; 1248 } 1249 } 1250 1251 1252 void SAXWriter::ignorableWhitespace(const OUString&) 1253 { 1254 if( ! m_bDocStarted ) 1255 { 1256 throw SAXException (); 1257 } 1258 1259 m_bForceLineBreak = true; 1260 } 1261 1262 void SAXWriter::processingInstruction(const OUString& aTarget, const OUString& aData) 1263 { 1264 if( ! m_bDocStarted || m_bIsCDATA ) 1265 { 1266 throw SAXException(); 1267 } 1268 1269 sal_Int32 nLength(0); 1270 if (m_bAllowLineBreak) 1271 { 1272 nLength = 2; // "<?" 1273 nLength += calcXMLByteLength( aTarget, false, false ); 1274 1275 nLength += 1; // " " 1276 1277 nLength += calcXMLByteLength( aData, false, false ); 1278 1279 nLength += 2; // "?>" 1280 } 1281 1282 sal_Int32 nPrefix = getIndentPrefixLength( nLength ); 1283 1284 if( nPrefix >= 0 ) 1285 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1286 1287 if (!m_pSaxWriterHelper->processingInstruction(aTarget, aData)) 1288 { 1289 SAXException except; 1290 except.Message = "Invalid character during XML-Export"; 1291 throw except; 1292 } 1293 } 1294 1295 1296 void SAXWriter::setDocumentLocator(const Reference< XLocator >&) 1297 { 1298 1299 } 1300 1301 void SAXWriter::startCDATA() 1302 { 1303 if( ! m_bDocStarted || m_bIsCDATA) 1304 { 1305 throw SAXException (); 1306 } 1307 1308 sal_Int32 nPrefix = getIndentPrefixLength( 9 ); 1309 if( nPrefix >= 0 ) 1310 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1311 1312 m_pSaxWriterHelper->startCDATA(); 1313 1314 m_bIsCDATA = true; 1315 } 1316 1317 void SAXWriter::endCDATA() 1318 { 1319 if( ! m_bDocStarted || ! m_bIsCDATA) 1320 { 1321 SAXException except; 1322 except.Message = "endCDATA was called without startCDATA"; 1323 throw except; 1324 } 1325 1326 sal_Int32 nPrefix = getIndentPrefixLength( 3 ); 1327 if( nPrefix >= 0 ) 1328 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1329 1330 m_pSaxWriterHelper->endCDATA(); 1331 1332 m_bIsCDATA = false; 1333 } 1334 1335 1336 void SAXWriter::comment(const OUString& sComment) 1337 { 1338 if( ! m_bDocStarted || m_bIsCDATA ) 1339 { 1340 throw SAXException(); 1341 } 1342 1343 sal_Int32 nLength(0); 1344 if (m_bAllowLineBreak) 1345 { 1346 nLength = 4; // "<!--" 1347 nLength += calcXMLByteLength( sComment, false, false); 1348 1349 nLength += 3; 1350 } 1351 1352 sal_Int32 nPrefix = getIndentPrefixLength( nLength ); 1353 if( nPrefix >= 0 ) 1354 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1355 1356 if (!m_pSaxWriterHelper->comment(sComment)) 1357 { 1358 SAXException except; 1359 except.Message = "Invalid character during XML-Export"; 1360 throw except; 1361 } 1362 } 1363 1364 1365 void SAXWriter::allowLineBreak( ) 1366 { 1367 if( ! m_bDocStarted || m_bAllowLineBreak ) { 1368 throw SAXException(); 1369 } 1370 1371 m_bAllowLineBreak = true; 1372 } 1373 1374 void SAXWriter::unknown(const OUString& sString) 1375 { 1376 1377 if( ! m_bDocStarted ) 1378 { 1379 throw SAXException (); 1380 } 1381 if( m_bIsCDATA ) 1382 { 1383 throw SAXException(); 1384 } 1385 1386 if( sString.startsWith( "<?xml" ) ) 1387 return; 1388 1389 sal_Int32 nLength(0); 1390 if (m_bAllowLineBreak) 1391 nLength = calcXMLByteLength( sString, false, false ); 1392 1393 sal_Int32 nPrefix = getIndentPrefixLength( nLength ); 1394 if( nPrefix >= 0 ) 1395 m_pSaxWriterHelper->insertIndentation( nPrefix ); 1396 1397 if (!m_pSaxWriterHelper->writeString( sString, false, false)) 1398 { 1399 SAXException except; 1400 except.Message = "Invalid character during XML-Export"; 1401 throw except; 1402 } 1403 } 1404 1405 } // namespace 1406 1407 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * 1408 com_sun_star_extensions_xml_sax_Writer_get_implementation( 1409 css::uno::XComponentContext *, 1410 css::uno::Sequence<css::uno::Any> const &) 1411 { 1412 return cppu::acquire(new SAXWriter); 1413 } 1414 1415 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ 1416
