-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathencoding.go
363 lines (334 loc) · 12.1 KB
/
encoding.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
package fonts
import (
"bytes"
"log"
"github.com/benoitkugler/pdf/fonts/cmaps"
"github.com/benoitkugler/pdf/fonts/simpleencodings"
"github.com/benoitkugler/pdf/fonts/standardcmaps"
"github.com/benoitkugler/pdf/fonts/standardfonts"
"github.com/benoitkugler/pdf/fonts/type1"
type1c "github.com/benoitkugler/pdf/fonts/type1C"
"github.com/benoitkugler/pdf/model"
)
// ResolveSimpleEncoding computes the effective encoding of a simple font.
//
// The logic mirrors poppler, which itself follows the PDF spec. A base
// encoding is selected first, from (by decreasing priority):
//  1. the font dictionary: a predefined Encoding, or the BaseEncoding of an
//     encoding dictionary (MacRoman / MacExpert / WinAnsi / Standard)
//  2. the embedded font file (Type 1 fonts only)
//  3. a default: WinAnsiEncoding for TrueType fonts, StandardEncoding otherwise
//
// The Differences of the encoding dictionary, if any, are then applied on top
// of that base.
func ResolveSimpleEncoding(font model.FontSimple) simpleencodings.Encoding {
	fontEnc := font.SimpleEncoding()
	dict, hasDict := fontEnc.(*model.SimpleEncodingDict)

	var base *simpleencodings.Encoding
	switch predefined, isPredefined := fontEnc.(model.SimpleEncodingPredefined); {
	case isPredefined:
		// the font dictionary overrides the font builtin encoding
		base = standardfonts.PredefinedEncodings[predefined]
	case hasDict && dict.BaseEncoding != "":
		base = standardfonts.PredefinedEncodings[dict.BaseEncoding]
	default:
		// look into the embedded font file, but only for Type 1 fonts:
		// trying to get an encoding out of a TrueType font is a losing
		// proposition
		if t1, isType1 := font.(model.FontType1); isType1 {
			base = builtinType1Encoding(t1.FontDescriptor)
		}
	}

	// nothing found so far: fall back to the default base encoding
	if base == nil {
		base = &simpleencodings.AdobeStandard
		if _, isTrueType := font.(model.FontTrueType); isTrueType {
			base = &simpleencodings.WinAnsi
		}
	}

	if !hasDict {
		return *base
	}
	// merge the differences into the base encoding
	return dict.Differences.Apply(*base)
}
// buildSimpleFromUnicode merges a ToUnicode CMap into the rune-to-byte table
// of enc and returns the resulting table.
//
// The ToUnicode CMap takes precedence, and the encoding is only used to fill
// the runes it does not cover. For simple fonts the char code (one byte long)
// and the ToUnicode CMap CID are identified, so entries whose CID is above 255
// are reported and skipped. Entries without any rune are skipped as well.
// Only the first rune of each entry is considered.
//
// When several char codes map to the same rune, the lowest code is kept, so
// that the result does not depend on the map iteration order.
func buildSimpleFromUnicode(enc *simpleencodings.Encoding, toUnicode map[model.CID][]rune) map[rune]byte {
	runeToByte := enc.RuneToByte()

	// runes whose byte has been set from the ToUnicode CMap (as opposed to
	// the base encoding, which may always be overridden)
	fromCMap := make(map[rune]struct{}, len(toUnicode))

	for cid, runes := range toUnicode {
		if cid > 255 { // invalid char code: warn and ignore it
			log.Printf("invalid char code in simple ToUnicode CMap : %d > 255\n", cid)
			continue
		}
		if len(runes) == 0 { // nothing to map: indexing would panic
			continue
		}

		r, code := runes[0], byte(cid)
		if _, seen := fromCMap[r]; seen && runeToByte[r] <= code {
			continue // a lower char code already claimed this rune
		}
		fromCMap[r] = struct{}{}
		runeToByte[r] = code
	}
	return runeToByte
}
// // build the definitive font encoding, expressed in term
// // of Unicode codepoint to byte
// func resolveCharMapType1(t model.FontType1, userCharMap map[string]rune) map[rune]byte {
// if enc, ok := t.Encoding.(model.SimpleEncodingPredefined); ok {
// // the font dict overide the font builtin encoding
// return simpleencodings.standardfonts.PredefinedEncodings[enc].RuneToByte()
// }
// var (
// base *simpleencodings.Encoding
// diffs model.Differences
// )
// if enc, ok := t.Encoding.(*model.SimpleEncodingDict); ok { // the font modifies an encoding
// // resolve the base encoding
// if enc.BaseEncoding != "" {
// base = simpleencodings.standardfonts.PredefinedEncodings[enc.BaseEncoding]
// } else { // try and fetch the embedded font information
// base = builtinType1Encoding(t.FontDescriptor)
// }
// diffs = enc.Differences
// } else { // the font use its builtin encoding (or Standard if none is found)
// base = builtinType1Encoding(t.FontDescriptor)
// }
// return applyDifferences(diffs, userCharMap, base)
// }
// func applyDifferences(diffs model.Differences, userCharMap map[string]rune, baseEnc *simpleencodings.Encoding) map[rune]byte {
// runeMap := baseEnc.NameToRune()
// // add an eventual user name mapping
// for name, r := range userCharMap {
// runeMap[name] = r
// }
// // add the potential difference
// withDiffs := diffs.Apply(*baseEnc)
// out := make(map[rune]byte)
// for by, name := range withDiffs {
// if name == "" {
// continue // not encoded
// }
// // resolve the rune from the name: first try with the
// // encoding names
// r := runeMap[name]
// if r == 0 {
// // try a global name registry
// r, _ = glyphsnames.GlyphToRune(name)
// }
// if r == 0 {
// log.Printf("font encoding: the name <%s> has no matching rune\n", name)
// } else {
// out[r] = byte(by)
// }
// }
// return out
// }
// builtinType1Encoding tries to read the embedded font file referenced by desc
// and returns the builtin encoding of the font.
//
// The Standard encoding is returned when there is no font file, when it can
// not be decoded or when it can not be parsed, so the result is never nil.
// The returned pointer may reference a shared package-level encoding.
func builtinType1Encoding(desc model.FontDescriptor) *simpleencodings.Encoding {
	// special case for two standard fonts where we don't need to read the font file
	switch desc.FontName {
	case "ZapfDingbats":
		return &simpleencodings.ZapfDingbats
	case "Symbol":
		return &simpleencodings.Symbol
	}

	if desc.FontFile == nil {
		return &simpleencodings.AdobeStandard
	}

	content, err := desc.FontFile.Decode()
	if err != nil {
		log.Printf("unable to decode embedded font file: %s\n", err)
		return &simpleencodings.AdobeStandard
	}

	if desc.FontFile.Subtype == "Type1C" { // CFF font program
		enc, err := type1c.ParseEncoding(bytes.NewReader(content))
		if err != nil {
			log.Printf("invalid Type1C embedded font file: %s\n", err)
			return &simpleencodings.AdobeStandard
		}
		if enc == nil {
			return &simpleencodings.AdobeStandard
		}
		// some Type 1C font files have empty encodings, which can break the
		// T1C->T1 conversion (since the 'seac' operator depends on having
		// the accents in the encoding), so we fill in any gaps from
		// StandardEncoding
		for i, std := range simpleencodings.AdobeStandard {
			if enc[i] == "" {
				enc[i] = std
			}
		}
		return enc
	}

	info, err := type1.ParseEncoding(bytes.NewReader(content))
	if err != nil {
		log.Printf("invalid Type1 embedded font file: %s\n", err)
		return &simpleencodings.AdobeStandard
	}
	return info
}
// func resolveCharMapTrueType(f model.FontTrueType, userCharMap map[string]rune) map[rune]byte {
// // 9.6.6.3 - when the font has no Encoding entry, or the font descriptor’s Symbolic flag is set
// // (in which case the Encoding entry is ignored)
// // the character mapping is the "identity"
// if (f.FontDescriptor.Flags&model.Symbolic) != 0 || f.Encoding == nil {
// out := make(map[rune]byte, 256)
// cm := trueTypeCharmap(f.FontDescriptor)
// if cm == nil { // assume simple byte encoding
// for r := rune(0); r <= 255; r++ {
// out[r] = byte(r)
// }
// } else {
// // If the font contains a (3, 0) subtable, the range of character codes shall be one of these: 0x0000 - 0x00FF,
// // 0xF000 - 0xF0FF, 0xF100 - 0xF1FF, or 0xF200 - 0xF2FF. Depending on the range of codes, each byte
// // from the string shall be prepended with the high byte of the range, to form a two-byte character, which shall
// // be used to select the associated glyph description from the subtable.
// for r := range cm.Compile() {
// out[r] = byte(r) // keep the lower order byte
// }
// }
// return out
// }
// // 9.6.6.3 - if the font has a named Encoding entry of either MacRomanEncoding or WinAnsiEncoding,
// // or if the font descriptor’s Nonsymbolic flag (see Table 123) is set
// if (f.FontDescriptor.Flags&model.Nonsymbolic) != 0 || f.Encoding == model.MacRomanEncoding || f.Encoding == model.WinAnsiEncoding {
// if f.Encoding == model.MacRomanEncoding {
// return simpleencodings.MacRoman.RuneToByte()
// } else if f.Encoding == model.WinAnsiEncoding {
// return simpleencodings.WinAnsi.RuneToByte()
// } else if dict, ok := f.Encoding.(*model.SimpleEncodingDict); ok {
// var base *simpleencodings.Encoding
// if dict.BaseEncoding != "" {
// base = simpleencodings.standardfonts.PredefinedEncodings[dict.BaseEncoding]
// } else {
// base = &simpleencodings.AdobeStandard
// }
// out := applyDifferences(dict.Differences, userCharMap, base)
// // Finally, any undefined entries in the table shall be filled using StandardEncoding.
// for r, bStd := range simpleencodings.AdobeStandard.RuneToByte() {
// if _, ok := out[r]; !ok { // missing rune
// out[r] = bStd
// }
// }
// return out
// }
// }
// // default value
// return simpleencodings.AdobeStandard.RuneToByte()
// }
// // may return nil
// func trueTypeCharmap(desc model.FontDescriptor) sfnt.Cmap {
// if desc.FontFile == nil {
// return nil
// }
// content, err := desc.FontFile.Decode()
// if err != nil {
// log.Printf("unable to decode embedded font file: %s\n", err)
// return nil
// }
// font, err := sfnt.Parse(bytes.NewReader(content))
// if err != nil {
// log.Printf("invalid TrueType embedded font file: %s\n", err)
// return nil
// }
// cmap, err := font.CmapTable()
// if err != nil {
// log.Printf("unable to read Cmap table in TrueType embedded font file: %s\n", err)
// }
// return cmap
// }
// func builtinTrueTypeEncoding(desc model.FontDescriptor) *simpleencodings.Encoding {
// if desc.FontFile == nil { // we choose an arbitrary encoding
// return &simpleencodings.AdobeStandard
// }
// content, err := desc.FontFile.Decode()
// if err != nil {
// log.Printf("unable to decode embedded font file: %s\n", err)
// return &simpleencodings.AdobeStandard
// }
// font, err := sfnt.Parse(bytes.NewReader(content))
// if err != nil {
// log.Printf("invalid TrueType embedded font file: %s\n", err)
// return &simpleencodings.AdobeStandard
// }
// cmap, err := font.CmapTable()
// if err != nil {
// log.Printf("invalid encoding in TrueType embedded font file: %s\n", err)
// return &simpleencodings.AdobeStandard
// }
// fontChars := cmap.Compile()
// var glyphNames sfnt.GlyphNames
// if postTable, err := font.PostTable(); err == nil && postTable.Names != nil {
// glyphNames = postTable.Names
// }
// runes := make(map[rune]byte, len(fontChars))
// var names [256]string
// for r, index := range fontChars {
// if index > 0xFF {
// log.Printf("overflow for glyph index %d in TrueType font", index)
// }
// runes[r] = byte(index) // keep the lower order byte
// // TODO:
// // name, err := font.GlyphName(&b, index)
// // if err != nil {
// // log.Printf("glyph index %d without name: %s\n", index, err)
// // } else {
// // names[runes[r]] = name
// // }
// }
// return &simpleencodings.Encoding{Names: names, Runes: runes}
// }
// func resolveCharMapType3(f model.FontType3, userCharMap map[string]rune) map[rune]byte {
// switch enc := f.Encoding.(type) {
// case model.SimpleEncodingPredefined:
// return simpleencodings.standardfonts.PredefinedEncodings[enc].RuneToByte()
// case *model.SimpleEncodingDict:
// base := &simpleencodings.AdobeStandard
// if enc.BaseEncoding != "" {
// base = simpleencodings.standardfonts.PredefinedEncodings[enc.BaseEncoding]
// }
// return applyDifferences(enc.Differences, userCharMap, base)
// default: // should not happen according to the spec
// return simpleencodings.AdobeStandard.RuneToByte()
// }
// }
// resolveToUnicode decodes and parses the given ToUnicode CMap, and resolves
// the chain of UseCMap entries if needed.
//
// The mappings defined by cmap itself take precedence: the CMap referenced by
// UseCMap is a base, which only provides the codes that cmap does not define.
// An unknown predefined base CMap is reported and ignored.
//
// An error is returned if cmap, or one of the CMap streams it references,
// can not be decoded or parsed.
func resolveToUnicode(cmap model.UnicodeCMap) (map[model.CID][]rune, error) {
	content, err := cmap.Decode()
	if err != nil {
		return nil, err
	}
	inner, err := cmaps.ParseUnicodeCMap(content)
	if err != nil {
		return nil, err
	}
	out := inner.ProperLookupTable()

	// resolve the base CMap, if any
	var used map[model.CID][]rune
	switch use := cmap.UseCMap.(type) {
	case model.UnicodeCMap:
		used, err = resolveToUnicode(use)
		if err != nil {
			return nil, err
		}
	case model.UnicodeCMapBasePredefined:
		if predef, ok := standardcmaps.ToUnicodeCMaps[model.ObjName(use)]; ok {
			used = predef.ProperLookupTable()
		} else {
			// do not use the zero value of a missing entry
			log.Printf("unknown predefined UnicodeCMap %s", use)
		}
	}

	if len(used) == 0 {
		return out, nil
	}
	if out == nil { // writing to a nil map would panic
		out = make(map[model.CID][]rune, len(used))
	}

	// merge the data from the UseCMap entry, without overriding the
	// mappings of the referencing CMap
	for k, v := range used {
		if _, defined := out[k]; !defined {
			out[k] = v
		}
	}
	return out, nil
}
// resolveCharMapType0 is an unfinished placeholder for Type 0 fonts: it only
// calls ToUnicodeCMapName on the CIDSystemInfo of the descendant font and
// discards whatever that call yields, so it produces no result for the caller.
func resolveCharMapType0(ft model.FontType0) {
// 9.10.2 - Mapping Character Codes to Unicode Values
// NOTE(review): presumably this section number refers to the PDF
// specification - confirm before completing this function.
ft.DescendantFonts.CIDSystemInfo.ToUnicodeCMapName()
}
// reverseToUnicode builds the reverse mapping of m, from rune to CID.
//
// For now, the case of multiple runes (such as ligatures) is ignored: only
// the first rune of each entry is used. Entries without any rune are skipped.
// When several CIDs share the same first rune, the lowest CID is kept, so
// that the result does not depend on the map iteration order.
func reverseToUnicode(m map[model.CID][]rune) map[rune]model.CID {
	out := make(map[rune]model.CID, len(m))
	for cid, runes := range m {
		if len(runes) == 0 { // nothing to reverse: indexing would panic
			continue
		}
		r := runes[0]
		if prev, ok := out[r]; ok && prev <= cid {
			continue // a lower CID already claimed this rune
		}
		out[r] = cid
	}
	return out
}