1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sal/main.h>
26 #include <sal/types.h>
27 #include <rtl/ustring.hxx>
28 
29 #include <vector>
30 
31 static void make_hhc_char(FILE *sfp, FILE *cfp);
32 static void make_stc_char(FILE *sfp, FILE *cfp);
33 static void make_stc_word(FILE *sfp, FILE *cfp);
34 
35 /* Main Procedure */
36 
/**
 * Entry point of the text-conversion dictionary generator.
 *
 * Expected arguments:
 *   argv[1]  conversion type: "hhc_char", "stc_char" or "stc_word"
 *   argv[2]  dictionary source file (read in binary mode)
 *   argv[3]  C++ file to generate (overwritten)
 *
 * The generated file consists of a fixed header, an extern "C" block filled
 * by the generator matching argv[1], and the closing brace of that block.
 *
 * Exits with -1 when arguments are missing and with 1 when a file cannot be
 * opened; returns 0 otherwise.
 */
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
    if (argc < 4)
    {
        fprintf(stderr, "Usage: %s hhc_char|stc_char|stc_word <dictionary source> <output file>\n",
                argc > 0 ? argv[0] : "genconv_dict");
        exit(-1);
    }

    FILE *sfp = fopen(argv[2], "rb"); // open the source file for read;
    if (sfp == nullptr)
    {
        // argv[2] is the file that failed to open; argv[1] is only the conversion type.
        fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[2], strerror(errno));
        exit(1);
    }

    // create the C source file to write
    FILE *cfp = fopen(argv[3], "wb");
    if (cfp == nullptr) {
        // Remember the reason now: fclose() below is allowed to overwrite errno.
        const int openError = errno;
        fclose(sfp);
        fprintf(stderr, "Opening %s for writing failed: %s\n", argv[3], strerror(openError));
        exit(1);
    }

    fprintf(cfp, "/*\n");
    fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n");
    fprintf(cfp, " * All Rights Reserved.\n");
    fprintf(cfp, " */\n\n");
    fprintf(cfp, "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n");
    fprintf(cfp, "#include <sal/types.h>\n");
    fprintf(cfp, "#include <textconversion.hxx>\n");
    fprintf(cfp, "\nextern \"C\" {\n");

    if (strcmp(argv[1], "hhc_char") == 0)
        make_hhc_char(sfp, cfp);
    else if (strcmp(argv[1], "stc_char") == 0)
        make_stc_char(sfp, cfp);
    else if (strcmp(argv[1], "stc_word") == 0)
        make_stc_word(sfp, cfp);
    else
        // The output file is still completed (with an empty extern "C" block),
        // but make the reason for the missing tables visible.
        fprintf(stderr, "Unknown conversion type %s, no tables generated\n", argv[1]);

    fprintf (cfp, "}\n");

    fclose(sfp);
    fclose(cfp);

    return 0;
} // end of main
82 
83 // Hangul/Hanja character conversion
make_hhc_char(FILE * sfp,FILE * cfp)84 void make_hhc_char(FILE *sfp, FILE *cfp)
85 {
86     sal_Int32 count, address, i, j, k;
87     sal_Unicode Hanja2HangulData[0x10000];
88     for (i = 0; i < 0x10000; i++) {
89         Hanja2HangulData[i] = 0;
90     }
91     sal_uInt16 Hangul2HanjaData[10000][3];
92 
93     // generate main dict. data array
94     fprintf(cfp, "\nstatic const sal_Unicode Hangul2HanjaData[] = {");
95 
96     char Cstr[1024];
97     count = 0;
98     address = 0;
99     while (fgets(Cstr, 1024, sfp)) {
100         // input file is in UTF-8 encoding (Hangul:Hanja)
101         // don't convert last new line character to Ostr.
102         OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
103         sal_Int32  len = Ostr.getLength();
104 
105         Hangul2HanjaData[count][0] = Ostr[0];
106         Hangul2HanjaData[count][1] = sal::static_int_cast<sal_uInt16>( address );
107         Hangul2HanjaData[count][2] = sal::static_int_cast<sal_uInt16>( len - 2 );
108         count++;
109 
110         for (i = 2; i < len; i++) {
111             Hanja2HangulData[Ostr[i]] = Ostr[0];
112             if (address++ % 16 == 0)
113                 fprintf(cfp, "\n\t");
114             fprintf(cfp, "0x%04x, ", Ostr[i]);
115         }
116     }
117     fprintf(cfp, "\n};\n");
118 
119     fprintf(cfp, "\nstatic const i18npool::Hangul_Index Hangul2HanjaIndex[] = {\n");
120     for (i = 0; i < count; i++)
121         fprintf(cfp, "\t{ 0x%04x, 0x%04x, 0x%02x },\n",
122                         Hangul2HanjaData[i][0],
123                         Hangul2HanjaData[i][1],
124                         Hangul2HanjaData[i][2]);
125     fprintf(cfp, "};\n");
126 
127     fprintf(cfp, "\nstatic const sal_uInt16 Hanja2HangulIndex[] = {");
128 
129     address=0;
130     for (i = 0; i < 0x10; i++) {
131         fprintf(cfp, "\n\t");
132         for (j = 0; j < 0x10; j++) {
133             for (k = 0; k < 0x100; k++) {
134                 if (Hanja2HangulData[((i*0x10)+j)*0x100+k] != 0)
135                     break;
136             }
137             fprintf(
138                 cfp, "0x%04lx, ",
139                 sal::static_int_cast< unsigned long >(
140                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
141         }
142     }
143     fprintf(cfp, "\n};\n");
144 
145     fprintf(cfp, "\nstatic const sal_Unicode Hanja2HangulData[] = {");
146 
147     for (i = 0; i < 0x100; i++) {
148         for (j = 0; j < 0x100; j++) {
149             if (Hanja2HangulData[i*0x100+j] != 0)
150                 break;
151         }
152         if (j < 0x100) {
153             for (j = 0; j < 0x10; j++) {
154                 fprintf(cfp, "\n\t");
155                 for (k = 0; k < 0x10; k++) {
156                     sal_Unicode c = Hanja2HangulData[((i*0x10+j)*0x10)+k];
157                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
158                 }
159             }
160         }
161     }
162     fprintf(cfp, "\n};\n");
163 
164     // create function to return arrays
165     fprintf (cfp, "\tconst sal_Unicode* getHangul2HanjaData() { return Hangul2HanjaData; }\n");
166     fprintf (cfp, "\tconst i18npool::Hangul_Index* getHangul2HanjaIndex() { return Hangul2HanjaIndex; }\n");
167     fprintf (cfp, "\tsal_Int16 getHangul2HanjaIndexCount() { return sizeof(Hangul2HanjaIndex) / sizeof(i18npool::Hangul_Index); }\n");
168     fprintf (cfp, "\tconst sal_uInt16* getHanja2HangulIndex() { return Hanja2HangulIndex; }\n");
169     fprintf (cfp, "\tconst sal_Unicode* getHanja2HangulData() { return Hanja2HangulData; }\n");
170 }
171 
172 // Simplified/Traditional Chinese character conversion
make_stc_char(FILE * sfp,FILE * cfp)173 void make_stc_char(FILE *sfp, FILE *cfp)
174 {
175     sal_Int32 address, i, j, k;
176     sal_Unicode SChinese2TChineseData[0x10000];
177     sal_Unicode SChinese2VChineseData[0x10000];
178     sal_Unicode TChinese2SChineseData[0x10000];
179     for (i = 0; i < 0x10000; i++) {
180         SChinese2TChineseData[i] = 0;
181         SChinese2VChineseData[i] = 0;
182         TChinese2SChineseData[i] = 0;
183     }
184 
185     char Cstr[1024];
186     while (fgets(Cstr, 1024, sfp)) {
187         // input file is in UTF-8 encoding (SChinese:TChinese)
188         // don't convert last new line character to Ostr.
189         OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
190         sal_Int32  len = Ostr.getLength();
191         if (Ostr[1] == 'v')
192             SChinese2VChineseData[Ostr[0]] = Ostr[2];
193         else {
194             SChinese2TChineseData[Ostr[0]] = Ostr[2];
195             if (SChinese2VChineseData[Ostr[0]] == 0)
196                 SChinese2VChineseData[Ostr[0]] = Ostr[2];
197         }
198         for (i = 2; i < len; i++)
199             TChinese2SChineseData[Ostr[i]] = Ostr[0];
200     }
201 
202     fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2T[] = {");
203 
204     address=0;
205     for (i = 0; i < 0x10; i++) {
206         fprintf(cfp, "\n\t");
207         for (j = 0; j < 0x10; j++) {
208             for (k = 0; k < 0x100; k++) {
209                 if (SChinese2TChineseData[((i*0x10)+j)*0x100+k] != 0)
210                     break;
211             }
212             fprintf(
213                 cfp, "0x%04lx, ",
214                 sal::static_int_cast< unsigned long >(
215                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
216         }
217     }
218     fprintf(cfp, "\n};\n");
219 
220     fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2T[] = {");
221 
222     for (i = 0; i < 0x100; i++) {
223         for (j = 0; j < 0x100; j++) {
224             if (SChinese2TChineseData[i*0x100+j] != 0)
225                 break;
226         }
227         if (j < 0x100) {
228             for (j = 0; j < 0x10; j++) {
229                 fprintf(cfp, "\n\t");
230                 for (k = 0; k < 0x10; k++) {
231                     sal_Unicode c = SChinese2TChineseData[((i*0x10+j)*0x10)+k];
232                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
233                 }
234             }
235         }
236     }
237     fprintf(cfp, "\n};\n");
238 
239     fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_S2V[] = {");
240 
241     address=0;
242     for (i = 0; i < 0x10; i++) {
243         fprintf(cfp, "\n\t");
244         for (j = 0; j < 0x10; j++) {
245             for (k = 0; k < 0x100; k++) {
246                 if (SChinese2VChineseData[((i*0x10)+j)*0x100+k] != 0)
247                     break;
248             }
249             fprintf(
250                 cfp, "0x%04lx, ",
251                 sal::static_int_cast< unsigned long >(
252                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
253         }
254     }
255     fprintf(cfp, "\n};\n");
256 
257     fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_S2V[] = {");
258 
259     for (i = 0; i < 0x100; i++) {
260         for (j = 0; j < 0x100; j++) {
261             if (SChinese2VChineseData[i*0x100+j] != 0)
262                 break;
263         }
264         if (j < 0x100) {
265             for (j = 0; j < 0x10; j++) {
266                 fprintf(cfp, "\n\t");
267                 for (k = 0; k < 0x10; k++) {
268                     sal_Unicode c = SChinese2VChineseData[((i*0x10+j)*0x10)+k];
269                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
270                 }
271             }
272         }
273     }
274     fprintf(cfp, "\n};\n");
275 
276     fprintf(cfp, "\nstatic const sal_uInt16 STC_CharIndex_T2S[] = {");
277 
278     address=0;
279     for (i = 0; i < 0x10; i++) {
280         fprintf(cfp, "\n\t");
281         for (j = 0; j < 0x10; j++) {
282             for (k = 0; k < 0x100; k++) {
283                 if (TChinese2SChineseData[((i*0x10)+j)*0x100+k] != 0)
284                     break;
285             }
286             fprintf(
287                 cfp, "0x%04lx, ",
288                 sal::static_int_cast< unsigned long >(
289                     k < 0x100 ? (address++)*0x100 : 0xFFFF));
290         }
291     }
292     fprintf(cfp, "\n};\n");
293 
294     fprintf(cfp, "\nstatic const sal_Unicode STC_CharData_T2S[] = {");
295 
296     for (i = 0; i < 0x100; i++) {
297         for (j = 0; j < 0x100; j++) {
298             if (TChinese2SChineseData[i*0x100+j] != 0)
299                 break;
300         }
301         if (j < 0x100) {
302             for (j = 0; j < 0x10; j++) {
303                 fprintf(cfp, "\n\t");
304                 for (k = 0; k < 0x10; k++) {
305                     sal_Unicode c = TChinese2SChineseData[((i*0x10+j)*0x10)+k];
306                     fprintf(cfp, "0x%04x, ", c ? c : 0xFFFF);
307                 }
308             }
309         }
310     }
311     fprintf(cfp, "\n};\n");
312 
313     // create function to return arrays
314     fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2T() { return STC_CharIndex_S2T; }\n");
315     fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2T() { return STC_CharData_S2T; }\n");
316     fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_S2V() { return STC_CharIndex_S2V; }\n");
317     fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_S2V() { return STC_CharData_S2V; }\n");
318     fprintf (cfp, "\tconst sal_uInt16* getSTC_CharIndex_T2S() { return STC_CharIndex_T2S; }\n");
319     fprintf (cfp, "\tconst sal_Unicode* getSTC_CharData_T2S() { return STC_CharData_T2S; }\n");
320 }
321 
namespace {

// One word collected by make_stc_word(); arrays of these are sorted with
// Index_comp() before the word tables are written.
struct Index {
    // Offset of the word's first character within STC_WordData.
    sal_uInt16 address;
    // Number of UTF-16 units in the word (the terminating 0 is not counted).
    sal_Int32 len;
    // Pointer to the word's first character inside STC_WordData.
    sal_Unicode *data;
};

}
331 
332 extern "C" {
Index_comp(const void * s1,const void * s2)333 static int Index_comp(const void* s1, const void* s2)
334 {
335     Index const *p1 = static_cast<Index const *>(s1), *p2 = static_cast<Index const *>(s2);
336     int result = p1->len - p2->len;
337     for (int i = 0; result == 0 && i < p1->len; i++)
338         result = *(p1->data+i) - *(p2->data+i);
339     return result;
340 }
341 }
342 
343 // Simplified/Traditional Chinese word conversion
make_stc_word(FILE * sfp,FILE * cfp)344 void make_stc_word(FILE *sfp, FILE *cfp)
345 {
346     sal_Int32 count, i, length;
347     sal_Unicode STC_WordData[0x10000];
348     std::vector<Index> STC_WordEntry_S2T(0x10000);
349     std::vector<Index> STC_WordEntry_T2S(0x10000);
350     sal_Int32 count_S2T = 0, count_T2S = 0;
351     sal_Int32 line = 0, char_total = 0;
352     char Cstr[1024];
353 
354     while (fgets(Cstr, 1024, sfp)) {
355         // input file is in UTF-8 encoding (SChinese:TChinese)
356         // don't convert last new line character to Ostr.
357         OUString Ostr(Cstr, strlen(Cstr) - 1, RTL_TEXTENCODING_UTF8);
358         sal_Int32  len = Ostr.getLength();
359         if (char_total + len + 1 > 0xFFFF) {
360             fprintf(stderr, "Word Dictionary stc_word.dic is too big (line %" SAL_PRIdINT32 ")", line);
361             return;
362         }
363         sal_Int32 sep=-1, eq=-1, gt=-1, lt=-1;
364         if (((sep = eq = Ostr.indexOf('=')) > 0) ||
365             ((sep = gt = Ostr.indexOf('>')) > 0) ||
366             ((sep = lt = Ostr.indexOf('<')) > 0)) {
367 
368             if (eq > 0 || gt > 0) {
369                 STC_WordEntry_S2T[count_S2T].address = sal::static_int_cast<sal_uInt16>( char_total );
370                 STC_WordEntry_S2T[count_S2T].len = sep;
371                 STC_WordEntry_S2T[count_S2T++].data = &STC_WordData[char_total];
372             }
373             if (eq > 0 || lt > 0) {
374                 STC_WordEntry_T2S[count_T2S].address = sal::static_int_cast<sal_uInt16>( char_total + sep + 1 );
375                 STC_WordEntry_T2S[count_T2S].len = len - sep - 1;
376                 STC_WordEntry_T2S[count_T2S++].data = &STC_WordData[char_total + sep + 1];
377             }
378             for (i = 0; i < len; i++)
379                 STC_WordData[char_total++] = (i == sep) ? 0 : Ostr[i];
380             STC_WordData[char_total++] = 0;
381         } else {
382             fprintf(stderr, "Invalid entry in stc_word.dic (line %" SAL_PRIdINT64 ")", sal_Int64(line));
383             return;
384         }
385         line++;
386     }
387 
388     if (char_total > 0) {
389         fprintf(cfp, "\nstatic const sal_Unicode STC_WordData[] = {");
390         for (i = 0; i < char_total; i++) {
391             if (i % 32 == 0) fprintf(cfp, "\n\t");
392             fprintf(cfp, "0x%04x, ", STC_WordData[i]);
393         }
394         fprintf(cfp, "\n};\n");
395 
396         fprintf(cfp, "\nstatic sal_Int32 STC_WordData_Count = %" SAL_PRIdINT32 ";\n", sal_Int32(char_total));
397 
398         // create function to return arrays
399         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = STC_WordData_Count; return STC_WordData; }\n");
400     } else {
401         fprintf (cfp, "\tconst sal_Unicode* getSTC_WordData(sal_Int32& count) { count = 0; return NULL; }\n");
402     }
403 
404     sal_uInt16 STC_WordIndex[0x100];
405 
406     if (count_S2T > 0) {
407         qsort(STC_WordEntry_S2T.data(), count_S2T, sizeof(Index), Index_comp);
408 
409         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_S2T[] = {");
410         count = 0;
411         length = 0;
412         for (i = 0; i < count_S2T; i++) {
413             if (i % 32 == 0) fprintf(cfp, "\n\t");
414             fprintf(cfp, "0x%04x, ", STC_WordEntry_S2T[i].address);
415             if (STC_WordEntry_S2T[i].len != length) {
416                 length = STC_WordEntry_S2T[i].len;
417                 while (count <= length)
418                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
419             }
420         }
421         fprintf(cfp, "\n};\n");
422         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
423 
424         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_S2T[] = {");
425         for (i = 0; i < count; i++) {
426             if (i % 16 == 0) fprintf(cfp, "\n\t");
427             fprintf(cfp, "0x%04x, ", STC_WordIndex[i]);
428         }
429         fprintf(cfp, "\n};\n");
430 
431         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_S2T_Count = %" SAL_PRIdINT64 ";\n", sal_Int64(length));
432         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return STC_WordEntry_S2T; }\n");
433         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = STC_WordIndex_S2T_Count; return STC_WordIndex_S2T; }\n");
434     } else {
435         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_S2T() { return NULL; }\n");
436         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_S2T(sal_Int32& count) { count = 0; return NULL; }\n");
437     }
438 
439     if (count_T2S > 0) {
440         qsort(STC_WordEntry_T2S.data(), count_T2S, sizeof(Index), Index_comp);
441 
442         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordEntry_T2S[] = {");
443         count = 0;
444         length = 0;
445         for (i = 0; i < count_T2S; i++) {
446             if (i % 32 == 0) fprintf(cfp, "\n\t");
447             fprintf(cfp, "0x%04x, ", STC_WordEntry_T2S[i].address);
448             if (STC_WordEntry_T2S[i].len != length) {
449                 length = STC_WordEntry_T2S[i].len;
450                 while (count <= length)
451                     STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
452             }
453         }
454         STC_WordIndex[count++] = sal::static_int_cast<sal_uInt16>(i);
455         fprintf(cfp, "\n};\n");
456 
457         fprintf(cfp, "\nstatic const sal_uInt16 STC_WordIndex_T2S[] = {");
458         for (i = 0; i < count; i++) {
459             if (i % 16 == 0) fprintf(cfp, "\n\t");
460             fprintf(cfp, "0x%04x, ",  STC_WordIndex[i]);
461         }
462         fprintf(cfp, "\n};\n");
463 
464         fprintf(cfp, "\nstatic sal_Int32 STC_WordIndex_T2S_Count = %" SAL_PRIdINT64 ";\n\n", sal_Int64(length));
465         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return STC_WordEntry_T2S; }\n");
466         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = STC_WordIndex_T2S_Count; return STC_WordIndex_T2S; }\n");
467     } else {
468         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordEntry_T2S() { return NULL; }\n");
469         fprintf (cfp, "\tconst sal_uInt16* getSTC_WordIndex_T2S(sal_Int32& count) { count = 0; return NULL; }\n");
470     }
471 }
472 
473 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
474