Skip to content

Commit c2e20bf

Browse files
author
vidy
committed
Fix word bounds not calculated correctly
1 parent 5e70622 commit c2e20bf

File tree

3 files changed

+38
-34
lines changed

3 files changed

+38
-34
lines changed

Diff for: examples/text.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ fn main() {
1111
let flow = pdf_text::run(&file, &page, &resolver, Default::default(), false).expect("can't render page");
1212

1313
for run in flow.runs {
14-
for line in run.lines {
15-
for word in line.words {
16-
println!("{}", word.text.as_str());
14+
for line in &run.lines {
15+
println!("{:?}", line.rect);
16+
for word in &line.words {
17+
println!("{}, {:?}", word.text.as_str(), word.rect);
1718
// for char in word.chars {
1819
// println!("{:?}", char);
1920
// }

Diff for: src/lib.rs

-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
2121
render_page(&mut tracer, resolve, &page, transform)?;
2222

2323
let bbox = tracer.view_box();
24-
2524
let items: Vec<DrawItem<OutlineBuilder>> = tracer.finish();
2625
//Get all patterns which may have lines and texts inside.
2726
let mut patterns = HashSet::new();

Diff for: src/text.rs

+34-30
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::{flow::{Char, Rect, Word}, util::avg};
1010
pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> Vec<Word> {
1111
let word_gap = analyze_word_gap(items.clone());
1212
let mut words = Vec::new();
13-
let mut current_word = WordBuilder::new(out.len());
13+
let mut current_word = WordBuilder::new(out.len(), 0.0);
1414

1515
// Whether the last processed TextChar is a whitespace
1616
// ' ' Space
@@ -20,11 +20,16 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
2020
// '\u{00A0}' Non-breaking space
2121
let mut trailing_space = out.chars().last().map_or(true, |c| c.is_whitespace());
2222

23+
let mut end = 0.; // trailing edge of the last char
24+
2325
for span in items {
2426
let mut offset = 0;
2527
let tr_inv = span.transform.matrix.inverse();
2628
let x_off = (tr_inv * span.transform.vector).x();
2729

30+
if span.text.contains("summary") {
31+
dbg!(span);
32+
}
2833
let mut chars = span.chars.iter().peekable();
2934
while let Some(current) = chars.next() {
3035
// Get text for current char
@@ -42,27 +47,29 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
4247

4348
let is_whitespace = text.chars().all(|c| c.is_whitespace());
4449

45-
// byte offset
50+
// byte offsets
4651
let offset_increment = text.len();
4752
// Handle word boundaries
4853
if trailing_space && !is_whitespace {
4954
// Start new word after space
50-
current_word.start_new(out.len(), char_start);
55+
current_word = WordBuilder::new(out.len(),char_start);
5156
current_word.add_char(0, offset_increment, char_start, char_end);
57+
5258
out.extend(text.nfkc());
5359
} else if !trailing_space {
5460
if is_whitespace {
5561
// End word at space
56-
words.push(current_word.build(out, char_end));
57-
current_word = WordBuilder::new(out.len());
62+
words.push(current_word.build(out));
63+
64+
current_word = WordBuilder::new(out.len(),char_start);
5865
out.push(' ');
59-
} else if current.pos + x_off > current_word.end_pos + word_gap {
66+
} else if current.pos + x_off > end + word_gap {
6067
// End word at large gap
61-
words.push(current_word.build(out, char_end));
68+
words.push(current_word.build(out));
6269

63-
current_word = WordBuilder::new(out.len());
64-
current_word.start_new(out.len(), char_start);
70+
current_word = WordBuilder::new(out.len(), char_start);
6571
current_word.add_char(0, offset_increment, char_start, char_end);
72+
6673
out.extend(text.nfkc());
6774
} else {
6875
// Continue current word
@@ -71,16 +78,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
7178
out.extend(text.nfkc());
7279
}
7380
}
74-
7581
trailing_space = is_whitespace;
82+
83+
end = current.pos + x_off + current.width;
84+
7685
current_word.update_bounds(span.rect.min_y(), span.rect.max_y());
7786
}
7887
}
7988

8089
// Add final word if any
8190
if !current_word.is_empty() {
82-
let end_pos = current_word.end_pos;
83-
words.push(current_word.build(out, end_pos));
91+
words.push(current_word.build(out));
8492
}
8593

8694
words
@@ -92,50 +100,46 @@ struct WordBuilder {
92100

93101
// For calculating the layout (position, width, height) of a word
94102
start_pos: f32,
95-
end_pos: f32, // trailing edge of the last char
103+
end_pos: f32,
96104
y_min: f32,
97105
y_max: f32,
98106

99107
chars: Vec<Char>,
100108
byte_offset: usize,
101-
started: bool,
109+
new: bool,
102110
}
103111

104112
impl WordBuilder {
105-
fn new(word_start_idx: usize) -> Self {
113+
fn new(word_start_idx: usize, start_pos: f32) -> Self {
106114
Self {
107115
word_start_idx,
108-
start_pos: 0.0,
116+
start_pos,
109117
end_pos: 0.0,
110118
y_min: f32::INFINITY,
111119
y_max: -f32::INFINITY,
112120
chars: Vec::new(),
113121
byte_offset: 0,
114-
started: false,
122+
new: true,
115123
}
116124
}
117125

118-
fn start_new(&mut self, word_start_idx: usize, start_pos: f32) {
119-
self.word_start_idx = word_start_idx;
120-
self.start_pos = start_pos;
121-
self.started = true;
122-
}
123-
124126
fn add_char(&mut self, offset: usize, offset_increment: usize, start: f32, end: f32) {
125127
self.chars.push(Char {
126128
offset,
127129
pos: start,
128130
width: end - start,
129131
});
130132
self.end_pos = end;
133+
131134
self.byte_offset += offset_increment;
132135
}
133136

134137
fn update_bounds(&mut self, min_y: f32, max_y: f32) {
135-
if !self.started {
138+
if self.new {
136139
self.y_min = min_y;
137140
self.y_max = max_y;
138-
self.started = true;
141+
142+
self.new = false;
139143
} else {
140144
self.y_min = self.y_min.min(min_y);
141145
self.y_max = self.y_max.max(max_y);
@@ -146,23 +150,23 @@ impl WordBuilder {
146150
self.chars.is_empty()
147151
}
148152

149-
fn build(mut self, out: &str, end_pos: f32) -> Word {
153+
fn build(mut self, out: &str) -> Word {
150154
Word {
151155
text: out[self.word_start_idx..].into(),
152156
rect: Rect {
153157
x: self.start_pos,
154158
y: self.y_min,
155159
h: self.y_max - self.y_min,
156-
w: end_pos - self.start_pos
160+
w: self.end_pos - self.start_pos
157161
},
158162
chars: take(&mut self.chars)
159163
}
160164
}
161165
}
162166

163-
/// Calculate gaps between each char,
167+
/// Calculate the gaps between each char; the returned value is in em units.
168+
164169
/// The most important thing here is to make sure the returned gap is bigger than the gap between chars, and smaller than the gap between words.
165-
///
166170
/// for example:
167171
/// think of something like "ab____________c de"
168172
///
@@ -186,7 +190,7 @@ fn analyze_word_gap<'a, E: Encoder + 'a>(items: impl Iterator<Item=&'a TextSpan<
186190
let gaps = items.clone()
187191
.flat_map(|s| {
188192
// the transform matrix is from em space to device space
189-
// so we need to invert it
193+
// so we need to invert it, which maps device space back to em space
190194
let tr_inv = s.transform.matrix.inverse();
191195
let pos = (tr_inv * s.transform.vector).x();
192196

0 commit comments

Comments
 (0)