-
Notifications
You must be signed in to change notification settings - Fork 0
/
doAlign.js
162 lines (138 loc) · 5.62 KB
/
doAlign.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
const fs = require("fs")
const path = require("path")
// Load the run configuration from config.json in the current working directory.
const config = JSON.parse(fs.readFileSync("config.json", "utf8"))

// Only the first configured location is actually processed by main();
// the remaining entries are merely echoed to the console below.
const { input: INPUT_LOCATION, output: OUTPUT_LOCATION } = config.locations[0]

config.locations.forEach((location) => {
  console.log(`Input: ${location.input}`)
  console.log(`Output: ${location.output}`)
})

// main is a function declaration further down, so it is hoisted and callable here.
main()
/**
 * Builds the aligned-corpus JSON from the files in INPUT_LOCATION and writes it
 * to OUTPUT_LOCATION.
 *
 * Novel names are derived by cutting the last 7 characters (the length of
 * " JP.txt" / " CN.txt" / " DI.txt") off every file name except "Common DI.txt".
 * For each name, `<name> JP.txt` and `<name> CN.txt` are required; the novel is
 * skipped with a warning when either is missing. `<name> DI.txt` and the shared
 * "Common DI.txt" are optional dictionaries that are merged before parsing.
 *
 * @throws {Error} Any file-system error other than a missing file, and any error
 *   raised while converting a novel (the novel name is logged before rethrowing).
 */
function main() {
  const filenames = fs.readdirSync(INPUT_LOCATION)

  const commonDiText = readTextIfExists(path.join(INPUT_LOCATION, `Common DI.txt`))
  if (commonDiText === null) {
    console.warn(`No common dictionary found at ${INPUT_LOCATION}`)
  } else {
    console.log(`Common dictionary found at ${INPUT_LOCATION}`)
  }

  // De-duplicate: the JP, CN and DI files of one novel all map to the same name.
  const novelNames = [
    ...new Set(
      filenames
        .filter((filename) => filename !== "Common DI.txt")
        .map((filename) => filename.slice(0, -7))
    ),
  ]

  const novels = novelNames
    .map((novelName, index) => {
      const jpText = readTextIfExists(path.join(INPUT_LOCATION, `${novelName} JP.txt`))
      // Do not touch the CN file when the JP file is already known to be missing.
      const cnText =
        jpText === null ? null : readTextIfExists(path.join(INPUT_LOCATION, `${novelName} CN.txt`))
      if (jpText === null || cnText === null) {
        console.warn(`No JP/CN text found for ${novelName}`)
        return null
      }

      let diText = readTextIfExists(path.join(INPUT_LOCATION, `${novelName} DI.txt`))
      if (diText === null) {
        console.warn(`No dictionary file found for ${novelName}`)
        diText = ""
      }
      // Merge without producing blank lines: a leading or interior blank line
      // would make a strict line parser stop before reaching the common entries.
      diText = mergeDictTexts(diText, commonDiText === null ? "" : commonDiText)

      try {
        const [textData, jpCharCount, cnCharCount] = convertTextToJSON(jpText, cnText)
        const [dictData] = convertDictToJSON(diText)
        return {
          // The id comes from the position in novelNames, i.e. before skipped
          // novels are filtered out, so ids may contain gaps.
          id_novel: index + 1,
          line_count: textData.length,
          ja_char_count: jpCharCount,
          zh_char_count: cnCharCount,
          gpt_dict: dictData,
          text_data: textData,
        }
      } catch (error) {
        // Identify the offending novel before letting the error propagate.
        console.error(novelName)
        throw error
      }
    })
    .filter(Boolean)

  const sumOf = (key) => novels.reduce((total, novel) => total + novel[key], 0)
  const outputJSON = {
    schema_version: config.schema_version,
    novel_count: novels.length,
    line_count_total: sumOf("line_count"),
    ja_char_count_total: sumOf("ja_char_count"),
    zh_char_count_total: sumOf("zh_char_count"),
    data: novels,
  }
  fs.writeFileSync(OUTPUT_LOCATION, JSON.stringify(outputJSON, null, 2), "utf8")
}

/**
 * Reads a UTF-8 text file, treating a missing file as an expected absence.
 *
 * @param {string} filePath Path of the file to read
 * @returns {string|null} The file contents, or null when the file does not exist
 * @throws {Error} Any read error other than ENOENT
 */
function readTextIfExists(filePath) {
  try {
    return fs.readFileSync(filePath, "utf8")
  } catch (error) {
    if (error.code === "ENOENT") {
      return null
    }
    throw error
  }
}

/**
 * Joins a novel's dictionary text and the common dictionary text with a single
 * newline, dropping whichever part is empty and any whitespace at the seam.
 *
 * @param {string} novelDiText Dictionary text of the novel ("" when it has none)
 * @param {string} commonDiText Shared dictionary text ("" when there is none)
 * @returns {string} The merged dictionary text, free of blank lines at the seam
 */
function mergeDictTexts(novelDiText, commonDiText) {
  const parts = [novelDiText.replace(/\s+$/, ""), commonDiText.replace(/^\s+/, "")]
  return parts.filter((part) => part !== "").join("\n")
}
/**
 * Pairs up the lines of a Japanese text and its Chinese translation.
 *
 * CRLF line endings are normalised to LF before splitting. Character counts are
 * the sums of the `length` of every line, so line breaks are not counted.
 *
 * @param {string} jpText Japanese text, one entry per line
 * @param {string} cnText Chinese text, one entry per line
 * @returns {Array} `[textData, jpCharCount, cnCharCount]`, where `textData` is an
 *   array of `{ id_line, ja_text, zh_text }` objects with 1-based `id_line`
 * @throws {Error} When the two texts do not have the same number of lines
 */
function convertTextToJSON(jpText, cnText) {
  const toLines = (text) => text.replace(/\r\n/g, "\n").split("\n")
  const totalLength = (lines) => lines.reduce((sum, line) => sum + line.length, 0)

  const jaLines = toLines(jpText)
  const zhLines = toLines(cnText)
  if (jaLines.length !== zhLines.length) {
    throw new Error("JP and CN texts have different number of lines")
  }

  const records = jaLines.map((jaLine, position) => ({
    id_line: position + 1,
    ja_text: jaLine,
    zh_text: zhLines[position],
  }))

  return [records, totalLength(jaLines), totalLength(zhLines)]
}
/**
 * Parses dictionary text into a list of glossary entries.
 *
 * Every non-blank line is expected to have the form `src==dst--info`. Each field
 * is trimmed and stripped of tab characters. Blank lines are ignored, and a line
 * that lacks either separator is reported with a warning and skipped, so one bad
 * line no longer discards the entries that follow it.
 *
 * @param {string} diText Raw dictionary text (LF or CRLF line endings); may be ""
 * @returns {Array} A one-element array whose only item is the array of
 *   `{ src, dst, info }` entries, in file order
 */
function convertDictToJSON(diText) {
  const dict = []
  if (diText === "") {
    return [dict]
  }

  // Trim surrounding whitespace, then remove any tab left inside the field.
  const cleanField = (field) => field.trim().replace(/\t/g, "")

  const diLines = diText.replace(/\r\n/g, "\n").split("\n")
  diLines.forEach((line, index) => {
    if (line.trim() === "") {
      return
    }
    const [srcPart, rest] = line.split("==")
    const [dstPart, infoPart] = rest === undefined ? [] : rest.split("--")
    // infoPart is undefined when either the "==" or the "--" separator is missing.
    if (infoPart === undefined) {
      console.warn(`Skipping malformed dictionary line ${index + 1}: ${line}`)
      return
    }
    dict.push({
      src: cleanField(srcPart),
      dst: cleanField(dstPart),
      info: cleanField(infoPart),
    })
  })
  return [dict]
}