-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcharsets.c
206 lines (187 loc) · 7.83 KB
/
charsets.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/*
Copyright 1998-2003 Victor Wagner
Copyright 2003 Alex Ott
This file is released under the GPL. Details can be
found in the file COPYING accompanying this distribution.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "catdoc.h"
/************************************************************************/
/* Looks up the unicode value stored for char c in the given charset    */
/* table, i.e. returns element number c of the array charset.           */
/* No range check is done: the caller must pass an index that is valid  */
/* for the table.                                                       */
/* Should be converted to macro                                         */
/************************************************************************/
int to_unicode (const short int *charset, int c) {
    const short int *slot = charset + c;
    return *slot;
}
/************************************************************************/
/* Searches the inverse charset record for the given unicode char.      */
/* The record is a two-level table: the high bits of u (u>>8) select a  */
/* page pointer, the low byte of u selects the entry inside that page.  */
/* Returns the entry stored in the page (0-255 char value), or -1 when  */
/* there is no page for this range of unicode values.                   */
/* NOTE(review): u>>8 is not range-checked here; presumably callers     */
/* only pass values covered by the table - confirm.                     */
/************************************************************************/
int from_unicode (CHARSET charset, int u) {
    short int *page = charset[(unsigned)u >> 8];

    if (page == NULL) {
        return -1;
    }
    return page[u & 0xff];
}
/************************************************************************/
/* Reads one 8-bit char through io_funcs->catdoc_read and converts it   */
/* from the source charset (global source_charset) to unicode.          */
/* Returns -1 if the read reports an error (negative result), EOF if    */
/* no char could be read, the unicode value otherwise.                  */
/* *offset is advanced by 1 only when a char was actually read.         */
/* fileend, read_buf and buf_is_unicode are not used by this reader;    */
/* presumably they exist so that all get_* readers share one signature. */
/************************************************************************/
int get_8bit_char (FILE *f,long *offset,long fileend, struct ole_params_t *ole_params,
        unsigned char read_buf[256], int *buf_is_unicode, struct io_funcs_t *io_funcs)
{
    unsigned char buf;
    int result;

    result=io_funcs->catdoc_read(&buf, 1, 1, f, ole_params);
    /* Same error convention as the 16-bit readers; without this check a
     * failed read would fall through and decode an uninitialized byte. */
    if (result<0) return -1;
    if (result!=1) return EOF;
    (*offset)++;
    return to_unicode(source_charset,buf);
}
/************************************************************************/
/* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  */
/* so read lsb first always and don't care about proper bit order.      */
/* The first byte read is the low byte, the second one the high byte.   */
/* Returns -1 if the read reports an error (negative result), EOF if    */
/* fewer than two bytes were read, the 16-bit value otherwise.          */
/* *offset is advanced by 2 only on success.                            */
/* fileend, read_buf and buf_is_unicode are not used by this reader.    */
/************************************************************************/
int get_utf16lsb (FILE *f,long *offset,long fileend, struct ole_params_t *ole_params,
        unsigned char read_buf[256], int *buf_is_unicode, struct io_funcs_t *io_funcs) {
    unsigned char buf[2];
    int result;

    result=io_funcs->catdoc_read(buf, 1, 2, f, ole_params);
    if (result<0) return -1;
    if (result !=2) {
        return EOF;
    }
    (*offset)+=2;
    /* LSB first: buf[0] holds bits 0-7, buf[1] holds bits 8-15 */
    return ((int)buf[0])|(((int)buf[1])<<8);
}
/************************************************************************/
/* Reads 16-bit unicode value written in MSB order. For processing      */
/* non-word files.                                                      */
/* The first byte read is the high byte, the second one the low byte.   */
/* Returns -1 if the read reports an error (negative result), EOF if    */
/* fewer than two bytes were read, the 16-bit value otherwise.          */
/* *offset is advanced by 2 only on success.                            */
/* fileend, read_buf and buf_is_unicode are not used by this reader.    */
/************************************************************************/
int get_utf16msb (FILE *f,long *offset,long fileend, struct ole_params_t *ole_params,
        unsigned char read_buf[256], int *buf_is_unicode, struct io_funcs_t *io_funcs) {
    unsigned char buf[2];
    int result;

    result=io_funcs->catdoc_read(buf, 1, 2, f, ole_params);
    if (result<0) return -1;
    if (result !=2) {
        return EOF;
    }
    (*offset)+=2;
    /* MSB first: buf[0] holds bits 8-15, buf[1] holds bits 0-7 */
    return (((int)buf[0])<<8)|((int)buf[1]);
}
/************************************************************************/
/* Reads one UTF-8 encoded char (1 to 3 bytes) and returns its unicode  */
/* value.                                                               */
/* Returns -1 if the first read reports an error (negative result),     */
/* EOF if the input ends before the sequence is complete, and 0xFEFF    */
/* for a lead byte that cannot be decoded here (a stray continuation    */
/* byte 0x80-0xBF or a lead byte >= 0xF0), so that the caller can skip  */
/* it. Continuation bytes are not validated.                            */
/* fileend, read_buf and buf_is_unicode are not used by this reader.    */
/* NOTE(review): unlike the 8-bit and 16-bit readers this function does */
/* not advance *offset - confirm whether callers rely on that.          */
/************************************************************************/
int get_utf8 (FILE *f,long *offset,long fileend, struct ole_params_t *ole_params, unsigned char read_buf[256],
        int *buf_is_unicode, struct io_funcs_t *io_funcs) {
    unsigned char buf[3];
    int c;
    int result;

    result=io_funcs->catdoc_read(buf, 1, 1, f, ole_params);
    if (result<0) return -1;
    if (result==0) return EOF;
    c=buf[0];
    if (c<0x80)
        return c;           /* single byte (ASCII) */
    if (c <0xC0)
        return 0xfeff;      /* skip corrupted sequences */
    if (c <0xE0) {
        /* two-byte sequence: 110xxxxx 10xxxxxx */
        if (io_funcs->catdoc_read(buf+1, 1, 1, f, ole_params)!=1) return EOF;
        return ((c & 0x1F)<<6) | (buf[1] & 0x3F);
    }
    if (c <0xF0) {
        /* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
         * Both continuation bytes must have been read; a result of 2 is
         * the success case, anything else means truncated input. */
        if (io_funcs->catdoc_read(buf+1, 1, 2, f, ole_params)!=2) return EOF;
        return ((c & 0x0F)<<12)|
            ((buf[1] & 0x3f)<<6)|
            (buf[2] & 0x3f);
    }
    /* four-byte and longer sequences are not supported */
    return 0xFEFF;
}
/**************************************************************************/
/* Converts unicode char to output sequence. Conversion has two steps:    */
/* 1. The spec_chars map is searched (via map_subst) for the character;   */
/*    if it has an entry there, that string is returned.                  */
/* 2. Otherwise the character is encoded as UTF-8 into UTFbuffer by       */
/*    to_utf8() and UTFbuffer is returned.                                */
/* In the second case the result is overwritten by the next conversion    */
/* that uses the same UTFbuffer.                                          */
/**************************************************************************/
const char *convert_char(int uc, char UTFbuffer[4]) {
    const char *replacement = map_subst(spec_chars, uc);

    if (replacement != NULL) {
        return replacement;
    }
    /* no entry in spec_chars: output is always UTF-8 */
    return to_utf8(uc, UTFbuffer);
}
/******************************************************************/
/* Converts given unicode character to the utf-8 sequence, stored */
/* NUL-terminated in the caller-supplied utfbuffer. The buffer is */
/* overwritten by the next call that uses it.                     */
/* utfbuffer has room for 4 bytes, i.e. a sequence of at most 3   */
/* bytes plus the terminator, so only values up to 0xFFFF can be  */
/* encoded. Larger values are emitted as U+FFFD (replacement      */
/* character) instead of a malformed sequence.                    */
/* Returns utfbuffer.                                             */
/******************************************************************/
char *to_utf8(unsigned int uc, char utfbuffer[4]) {
    int count=0;

    if (uc > 0xFFFF) {
        /* would need a 4-byte sequence, which does not fit */
        uc = 0xFFFD;
    }
    if (uc < 0x80) {
        utfbuffer[count++]=(char)uc;
    } else if (uc < 0x800) {
        utfbuffer[count++]=(char)(0xC0 | (uc >> 6));
        utfbuffer[count++]=(char)(0x80 | (uc & 0x3F));
    } else {
        utfbuffer[count++]=(char)(0xE0 | (uc >> 12));
        utfbuffer[count++]=(char)(0x80 | ((uc >> 6) & 0x3F));
        utfbuffer[count++]=(char)(0x80 | (uc & 0x3F));
    }
    utfbuffer[count]=0;
    return utfbuffer;
}
/* One entry of the cp_to_charset lookup table: pairs a numeric code page */
/* with the charset enumerator that charset_from_codepage() returns for it */
struct cp_map {
/* numeric code page; the value 0 marks the terminating table entry */
int codepage;
/* charset enumerator associated with this code page */
enum CHARSETS charset_name;
};
/* Lookup table from numeric code page to charset enumerator, scanned   */
/* linearly by charset_from_codepage(). The last entry (codepage 0)     */
/* terminates the table and supplies the result for unknown code pages. */
const struct cp_map cp_to_charset [] = {
    /* CP_* charsets */
    {437, CP_437},
    {850, CP_850},
    {852, CP_852},
    {855, CP_855},
    {857, CP_857},
    {860, CP_860},
    {861, CP_861},
    {862, CP_862},
    {863, CP_863},
    {864, CP_864},
    {865, CP_865},
    {866, CP_866},
    {869, CP_869},
    {874, CP_874},
    {1250, CP_1250},
    {1251, CP_1251},
    {1252, CP_1252},
    {1253, CP_1253},
    {1254, CP_1254},
    {1255, CP_1255},
    {1256, CP_1256},
    {1257, CP_1257},
    {1258, CP_1258},
    /* MAC_* charsets */
    {10000, MAC_ROMAN},
    {10004, MAC_ARABIC},
    {10005, MAC_HEBREW},
    {10006, MAC_GREEK},
    {10007, MAC_CYRILLIC},
    {10029, MAC_CENTEURO},
    /* KOI8_R */
    {20866, KOI8_R},
    /* C8859_* charsets */
    {28591, C8859_1},
    {28592, C8859_2},
    {28593, C8859_3},
    {28594, C8859_4},
    {28595, C8859_5},
    {28596, C8859_6},
    {28597, C8859_7},
    {28598, C8859_8},
    {28599, C8859_9},
    {28605, C8859_15},
    /* end of table */
    {0, C_NULL}
};
/* Returns the charset enumerator registered in cp_to_charset for the   */
/* given code page. When the code page is not listed, the scan stops at */
/* the terminating entry (codepage 0) and that entry's charset, C_NULL, */
/* is returned.                                                         */
enum CHARSETS charset_from_codepage(unsigned int codepage) {
    const struct cp_map *entry = cp_to_charset;

    while (entry->codepage != 0) {
        if (entry->codepage == codepage) {
            break;
        }
        entry++;
    }
    return entry->charset_name;
}