-
Notifications
You must be signed in to change notification settings - Fork 1
/
replication.go
224 lines (204 loc) · 7.22 KB
/
replication.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
package ortfodb
import (
	"fmt"
	"os"
	"path"
	"regexp"
	"sort"
	"strings"

	html2md "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/anaskhan96/soup"
	ll "github.com/ewen-lbh/label-logger-go"
	"github.com/mitchellh/mapstructure"
	"gopkg.in/yaml.v2"
)
// ReplicateAll replicates every work of works inside targetDatabase by calling
// ReplicateOne on each of them. It stops at the first failure and returns that
// error unchanged; it returns nil once all works have been processed.
func (ctx *RunContext) ReplicateAll(targetDatabase string, works Database) error {
	for _, current := range works {
		if err := ctx.ReplicateOne(targetDatabase, current); err != nil {
			return err
		}
	}
	return nil
}
// ReplicateOne replicates work inside targetDatabase: it creates the folder
// <targetDatabase>/<work.ID> (along with any missing parents) and writes into it
// a description.md file whose contents come from ReplicateDescription.
//
// Any failure (creating the directory, rebuilding the description or writing
// the file) is returned wrapped with the ID of the work.
func (ctx *RunContext) ReplicateOne(targetDatabase string, work Work) error {
	// TODO: make file mode configurable
	workDirectory := path.Join(targetDatabase, work.ID)
	if err := os.MkdirAll(workDirectory, os.FileMode(0o0777)); err != nil {
		return fmt.Errorf("while creating directory for %s: %w", work.ID, err)
	}

	description, err := ctx.ReplicateDescription(work)
	if err != nil {
		return fmt.Errorf("while replicating %s: %w", work.ID, err)
	}

	descriptionPath := path.Join(workDirectory, "description.md")
	if err := os.WriteFile(descriptionPath, []byte(description), os.FileMode(0o0777)); err != nil {
		return fmt.Errorf("while writing description of %s: %w", work.ID, err)
	}
	return nil
}
// ReplicateDescription reconstructs the contents of a description.md file from a Work struct.
//
// The result starts with the YAML header built from work.Metadata, followed,
// for every language of work.Content, by a language marker and the replicated
// content for that language. Languages are emitted in sorted order so that the
// same Work always yields the same text (map iteration order is random in Go).
// Leading and trailing whitespace is trimmed from the result.
//
// The first error reported while replicating the metadata or a language's
// content is returned, together with an empty string.
func (ctx *RunContext) ReplicateDescription(work Work) (string, error) {
	// Start with the YAML header, this one is never localized
	yamlHeader, err := ctx.replicateMetadata(work.Metadata)
	if err != nil {
		return "", err
	}

	languages := make([]string, 0, len(work.Content))
	for language := range work.Content {
		languages = append(languages, language)
	}
	sort.Strings(languages)

	var result strings.Builder
	result.WriteString(yamlHeader + "\n")
	// TODO get rid of "default" language behavior
	// if a file has NO language markers, auto-insert ":: (machine's language)" before parsing.
	for _, language := range languages {
		result.WriteString(ctx.replicateLanguageMarker(language) + "\n\n")
		replicatedBlock, err := ctx.replicateLocalizedBlock(work, language)
		if err != nil {
			return "", err
		}
		result.WriteString(replicatedBlock + "\n\n")
	}
	return strings.TrimSpace(result.String()), nil
}
// replicateLocalizedBlock rebuilds the markdown of work for a single language.
//
// It emits, each followed by a blank line: the title (when non-empty), then
// every block of the content in the order they appear in content.Blocks
// ("media", "link" and "paragraph" blocks; any other type is skipped), then the
// footnote definitions. The abbreviation definitions collected from the
// paragraphs come last. Footnotes are written sorted by name so that the
// output is deterministic (map iteration order is random in Go).
//
// An error is returned, with an empty string, as soon as a paragraph fails to
// replicate.
func (ctx *RunContext) replicateLocalizedBlock(work Work, language string) (string, error) {
	const end = "\n\n"
	content := work.Content[language]
	// Abbreviations will be stored here to declare them in the markdown
	abbreviations := make(Abbreviations)
	var result strings.Builder

	// Start with the title
	if content.Title != "" {
		result.WriteString(ctx.replicateTitle(content.Title) + end)
	}

	// Then, each block, in the order of content.Blocks
	for _, block := range content.Blocks {
		ll.Debug("replicating %s block #%s", block.Type, block.ID)
		switch block.Type {
		case "media":
			result.WriteString(ctx.replicateMediaEmbed(block.Media) + end)
		case "link":
			result.WriteString(ctx.replicateLink(block.Link) + end)
		case "paragraph":
			replicatedParagraph, err := ctx.replicateParagraph(block.Anchor, block.Paragraph)
			if err != nil {
				return "", err
			}
			// This is not finished: we need to properly translate to markdown abbreviations & footnotes
			parsedHTML := soup.HTMLParse(string(block.Content))
			abbreviations = merge(abbreviations, ctx.collectAbbreviations(parsedHTML))
			replicatedParagraph = ctx.transformAbbreviations(parsedHTML, replicatedParagraph)
			replicatedParagraph = ctx.transformFootnoteReferences(replicatedParagraph)
			result.WriteString(replicatedParagraph + end)
		default: // nothing
		}
	}

	// Sort footnote names: ranging over the map directly would shuffle the definitions between runs.
	footnoteNames := make([]string, 0, len(content.Footnotes))
	for name := range content.Footnotes {
		footnoteNames = append(footnoteNames, name)
	}
	sort.Strings(footnoteNames)
	for _, name := range footnoteNames {
		result.WriteString(ctx.replicateFootnoteDefinition(name, string(content.Footnotes[name])) + end)
	}

	result.WriteString(ctx.replicateAbbreviations(abbreviations))
	return result.String(), nil
}
// replicateLanguageMarker builds the marker line for language: the text ":: "
// followed by the language itself.
func (ctx *RunContext) replicateLanguageMarker(language string) string {
	const markerPrefix = ":: "
	return markerPrefix + language
}
// footnoteReferencePattern matches a footnote reference written as a markdown
// link, e.g. "[1](#fn:name)". The second capture group is the footnote's name.
var footnoteReferencePattern = regexp.MustCompile(`\[(\d+)\]\(#fn:([^)]+)\)`)

// transformFootnoteReferences rewrites footnote references of the form
// "[<digits>](#fn:<name>)" found in markdown into "[^<name>]".
//
// The text is processed line by line, so a reference can never span several
// lines. Each match is replaced where it stands, which keeps one reference from
// being rewritten inside another, longer match. Text without any reference is
// returned unchanged.
func (ctx *RunContext) transformFootnoteReferences(markdown string) string {
	lines := strings.Split(markdown, "\n")
	for i, line := range lines {
		// ${2} expands to the footnote's name; the substituted text is not expanded again.
		lines[i] = footnoteReferencePattern.ReplaceAllString(line, "[^${2}]")
	}
	return strings.Join(lines, "\n")
}
// transformAbbreviations removes abbreviation markup from markdown: for every
// <abbr> element found in htmlSoup, each occurrence of the element's HTML in
// markdown is replaced with the element's full text.
func (ctx *RunContext) transformAbbreviations(htmlSoup soup.Root, markdown string) string {
	stripped := markdown
	abbrElements := htmlSoup.FindAll("abbr")
	for i := range abbrElements {
		element := abbrElements[i]
		stripped = strings.ReplaceAll(stripped, element.HTML(), element.FullText())
	}
	return stripped
}
// collectAbbreviations gathers the abbreviations declared in htmlSoup: every
// <abbr> element contributes an entry mapping its full text to the value of its
// "title" attribute. When several elements share the same text, the last one wins.
func (ctx *RunContext) collectAbbreviations(htmlSoup soup.Root) Abbreviations {
	collected := Abbreviations{}
	for _, element := range htmlSoup.FindAll("abbr") {
		name, definition := element.FullText(), element.Attrs()["title"]
		collected[name] = definition
	}
	return collected
}
// replicateAbbreviations renders abbreviations as markdown abbreviation
// definitions: one "*[name]: definition" line per entry, each terminated by a
// newline. All abbreviations are replicated in one function so that each name
// is declared only once.
//
// Map keys are already unique, so no extra duplicate tracking is needed.
// Entries are written sorted by name so the output is deterministic. An empty
// (or nil) map yields an empty string.
func (ctx *RunContext) replicateAbbreviations(abbreviations Abbreviations) string {
	names := make([]string, 0, len(abbreviations))
	for name := range abbreviations {
		names = append(names, name)
	}
	sort.Strings(names)

	var result strings.Builder
	for _, name := range names {
		result.WriteString("*[" + name + "]: " + abbreviations[name] + "\n")
	}
	return result.String()
}
// replicateFootnoteDefinition builds the markdown definition of the footnote
// called name, in the form "[^name]: content".
func (ctx *RunContext) replicateFootnoteDefinition(name string, content string) string {
	parts := []string{"[^", name, "]: ", content}
	return strings.Join(parts, "")
}
// replicateLink builds the markdown link for link: "[text](url)", or
// `[text](url "title")` when the link has a non-empty title.
func (ctx *RunContext) replicateLink(link Link) string {
	label := link.Text.String()
	if link.Title == "" {
		return "[" + label + "](" + link.URL + ")"
	}
	return "[" + label + `](` + link.URL + ` "` + link.Title + `")`
}
// replicateTitle builds a level-1 markdown heading ("# ...") from title, after
// converting the title to markdown.
func (ctx *RunContext) replicateTitle(title HTMLString) string {
	heading := title.Markdown()
	return "# " + heading
}
// replicateMetadata builds the YAML header of a description file from metadata.
//
// metadata is first decoded into a generic map with mapstructure, which is then
// marshalled to YAML and wrapped between two "---" lines (no trailing newline
// after the closing one). A failure of either step is returned wrapped, with an
// empty string.
func (ctx *RunContext) replicateMetadata(metadata WorkMetadata) (string, error) {
	metadataOut := make(map[string]interface{})
	if err := mapstructure.Decode(metadata, &metadataOut); err != nil {
		return "", fmt.Errorf("while decoding metadata: %w", err)
	}
	yamlBytes, err := yaml.Marshal(metadataOut)
	if err != nil {
		return "", fmt.Errorf("while encoding metadata to YAML: %w", err)
	}
	return "---\n" + string(yamlBytes) + "---", nil
}
// replicateMediaAttributesString encodes attributes as a string of marker
// characters, in this order: RuneAutoplay when Autoplay is set,
// RuneHideControls when Controls is NOT set, and RuneLoop when Loop is set.
// The result is empty when none of these apply.
func (ctx *RunContext) replicateMediaAttributesString(attributes MediaAttributes) string {
	var markers strings.Builder
	if attributes.Autoplay {
		markers.WriteString(string(RuneAutoplay))
	}
	// Controls are on by default, so only their absence is marked.
	if !attributes.Controls {
		markers.WriteString(string(RuneHideControls))
	}
	if attributes.Loop {
		markers.WriteString(string(RuneLoop))
	}
	return markers.String()
}
// replicateMediaEmbed builds the markdown image-style embed for media:
// `![alt attributes](source)`, or `![alt attributes](source "caption")` when
// the media has a non-empty caption. The alt text and the attributes string are
// always separated by a single space.
//
// TODO: configure whether to use >[]() syntax: never, or only for non-images
func (ctx *RunContext) replicateMediaEmbed(media Media) string {
	attributes := ctx.replicateMediaAttributesString(media.Attributes)
	source := string(media.RelativeSource)
	if media.Caption == "" {
		return fmt.Sprintf(`![%s %s](%s)`, media.Alt, attributes, source)
	}
	return fmt.Sprintf(`![%s %s](%s "%s")`, media.Alt, attributes, source, media.Caption)
}
// replicateParagraph converts the content of p to markdown. A paragraph whose
// markdown is empty or only whitespace is written as "<p></p>". When anchor is
// not empty, the markdown is preceded by a "{#anchor}" line.
//
// The returned error is always nil.
func (ctx *RunContext) replicateParagraph(anchor string, p Paragraph) (string, error) {
	body := p.Content.Markdown()
	if strings.TrimSpace(body) == "" {
		body = "<p></p>"
	}
	if anchor == "" {
		return body, nil
	}
	return "{#" + anchor + "}\n" + body, nil
}
// Markdown converts html to markdown with the html2md converter. When the
// conversion fails, the result of html.String() is returned instead.
func (html HTMLString) Markdown() string {
	// TODO: configurable domain for translating relative to absolute URLS from ortfodb.yaml
	markdown, err := html2md.NewConverter("", true, nil).ConvertString(string(html))
	if err == nil {
		return markdown
	}
	return html.String()
}