Skip to content

Commit e37036b

Browse files
author
vidy
committed
Fix word gap not being handled correctly
1 parent 136df23 commit e37036b

File tree

3 files changed

+47
-27
lines changed

3 files changed

+47
-27
lines changed

src/flow.rs

+19-11
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,13 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
9191
match *node {
9292
Node::Final { ref indices } => {
9393
if indices.len() > 0 {
94-
let node_spans = indices.iter().flat_map(|&i| spans.get(i));
95-
let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap();
94+
let node_spans = indices.iter()
95+
.flat_map(|&i| spans.get(i));
96+
let bbox = node_spans.clone()
97+
.map(|s| s.rect)
98+
.reduce(|a, b| a.union_rect(b))
99+
.unwrap();
100+
96101
let class = classify(node_spans.clone());
97102
let mut text = String::new();
98103
let words = concat_text(&mut text, node_spans);
@@ -111,25 +116,26 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
111116
NodeTag::Line => {
112117
let mut indices = vec![];
113118
node.indices(&mut indices);
119+
114120
let line_spans = indices.iter().flat_map(|&i| spans.get(i));
115121
let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into();
116122

117-
let mut text = String::new();
118-
let words = concat_text(&mut text, line_spans.clone());
119123
let class = classify(line_spans.clone());
124+
let mut text = String::new();
125+
let words = concat_text(&mut text, line_spans);
120126

121127
let t = match class {
122128
Class::Header => RunType::Header,
123129
_ => RunType::Paragraph,
124130
};
125-
126131

127132
flow.add_line(words, t);
128133
}
129134
NodeTag::Paragraph => {
130-
assert_eq!(x.len(), 0);
135+
assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty");
131136
let mut lines: Vec<(RectF, usize)> = vec![];
132137
let mut indices = vec![];
138+
133139
for n in cells {
134140
let start = indices.len();
135141
n.indices(&mut indices);
@@ -142,8 +148,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
142148

143149
let para_spans = indices.iter().flat_map(|&i| spans.get(i));
144150
let class = classify(para_spans.clone());
151+
// the bounding box of the paragraph
145152
let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap();
146153
let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap();
154+
147155
// classify the lines by this vertical line
148156
let left_margin = bbox.min_x() + 0.5 * line_height;
149157

@@ -158,9 +166,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
158166
left += 1;
159167
}
160168
}
169+
//typically paragraphs are indented to the right and longer than 2 lines.
170+
//then there will be a higher left count than right count.
161171

162-
// typically paragraphs are indented to the right and longer than 2 lines.
163-
// then there will be a higher left count than right count.
172+
//TODO: What if both lines of a two-line paragraph start at the same x? Then left == right and no indent is detected.
164173
let indent = left > right;
165174

166175
let mut para_start = 0;
@@ -180,9 +189,8 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
180189
}
181190
});
182191
para_start = line_start;
183-
} else {
184-
text.push('\n');
185-
}
192+
}
193+
text.push('\n');
186194
}
187195
if end > line_start {
188196
let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i)));

src/node.rs

+9-2
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,14 @@ pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)]
9191
#[derive(Debug)]
9292
pub enum Node {
9393
Final { indices: Vec<usize> },
94-
Grid { x: Vec<f32>, y: Vec<f32>, cells: Vec<Node>, tag: NodeTag },
94+
Grid {
95+
// vertical gaps
96+
x: Vec<f32>,
97+
// horizontal gaps
98+
y: Vec<f32>,
99+
cells: Vec<Node>,
100+
tag: NodeTag
101+
},
95102
Table { table: table::Table<Vec<usize>> },
96103
}
97104
impl Node {
@@ -170,7 +177,7 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
170177
return overlapping_lines(boxes);
171178
}
172179

173-
//TODO: Disable the table::split for now,becuase it is not accurate
180+
//TODO: table::split is disabled for now, because it is not accurate
174181
// if x_gaps.len() > 1 && y_gaps.len() > 1 {
175182
// return table::split(boxes, spans, lines);
176183
// }

src/text.rs

+19-14
Original file line numberDiff line numberDiff line change
@@ -14,47 +14,48 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
1414
// Whether the last processed TextChar is a space
1515
let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true);
1616

17+
let mut word_start_idx = out.len();
18+
19+
// For calculating the layout (position, width, height) of a word
1720
let mut word_start_pos = 0.0;
1821
let mut word_end_pos = 0.0;
19-
20-
let mut word_start_idx = out.len();
2122
let mut y_min = f32::INFINITY;
2223
let mut y_max = -f32::INFINITY;
24+
2325
let mut word_start = true;
2426

2527
for span in items {
2628
let mut offset = 0; // byte index of last char into span.text
2729
let tr_inv = span.transform.matrix.inverse();
2830
let x_off = (tr_inv * span.transform.vector).x();
29-
31+
3032
let chars = span.chars.as_slice();
3133
for (i, c) in chars.iter().enumerate() {
3234
let next_offset = chars.get(i + 1).map_or(span.text.len(), |next| next.offset);
3335
let s: &str = &span.text[offset..next_offset];
3436

3537
let is_whitespace = s.chars().all(|c| c.is_whitespace());
38+
3639
if trailing_space {
3740
if !is_whitespace {
3841
word_start = true;
3942
word_start_idx = out.len();
40-
}
41-
trailing_space = is_whitespace;
4243

43-
out.extend(s.nfkc());
44+
out.extend(s.nfkc());
45+
}
4446
} else {
45-
trailing_space = is_whitespace;
46-
out.extend(s.nfkc());
47-
4847
if is_whitespace {
4948
words.push(Word {
50-
text: out[word_start_idx..out.len()-s.len()].into(),
49+
text: out[word_start_idx..].into(),
5150
rect: Rect {
5251
x: word_start_pos,
5352
y: y_min,
5453
h: y_max - y_min,
5554
w: word_end_pos - word_start_pos
5655
}
5756
});
57+
out.push_str(" ");
58+
word_start_idx = out.len();
5859
} else if c.pos + x_off > end + word_gap {
5960
words.push(Word {
6061
text: out[word_start_idx..].into(),
@@ -66,13 +67,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
6667
}
6768
});
6869

69-
out.push(' ');
70-
trailing_space = true;
7170
word_start = true;
72-
word_start_idx = out.len() - 1;
71+
word_start_idx = out.len();
72+
73+
out.extend(s.nfkc());
74+
} else {
75+
out.extend(s.nfkc());
7376
}
7477
}
7578

79+
trailing_space = is_whitespace;
80+
7681
end = c.pos + x_off + c.width;
7782
word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
7883

@@ -89,7 +94,7 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
8994
offset = next_offset;
9095
}
9196
}
92-
97+
9398
words.push(Word {
9499
text: out[word_start_idx..].into(),
95100
rect: Rect {

0 commit comments

Comments
 (0)