-
Notifications
You must be signed in to change notification settings - Fork 34
/
hp_workplace.py
244 lines (225 loc) · 12.1 KB
/
hp_workplace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
"""Parser for HP product carbon footprint PDF reports (desktops, laptops, monitors, tablets and workstations).

See an example here:
https://h22235.www2.hp.com/hpinfo/globalcitizenship/environment/productdata/Countries/_MultiCountry/productcarbonfootprint_notebo_2020116223055953.pdf
"""
import datetime
import hashlib
import logging
import math
import re
from typing import Any, BinaryIO, Dict, Iterable, Iterator, Optional

from tools.parsers.lib import data
from tools.parsers.lib import loader
from tools.parsers.lib import pdf
from tools.parsers.lib import piechart_analyser
from tools.parsers.lib import text
# Regular expressions searched in the text extracted from the PDF. Every
# named group is one field of the report; some fields have several
# alternative phrasings.
_HP_DESK_PATTERNS = tuple(
    re.compile(expression)
    for expression in (
        # Product name.
        r'Product carbon footprint (?P<name>.*?)\s*Estimated impact',
        r'Product (c|C)arbon (f|F)ootprint (Report)*\s*(?P<name>.{0,50}?)\s*GHG',
        # Total footprint, without and with a +/- tolerance.
        r'Estimated impact (?P<footprint>[0-9]*)\s*kgCO2 eq.',
        r'herein.(?P<footprint>[0-9]*)\s*kg\s*CO2eq.',
        r'Other\s*organizations\s*might\s*report\s*this\s*value\s*(as)?\s*(?P<footprint_with_error>[0-9]*)\s*\+\/\-\s*(?P<tolerance>[0-9]*)\s*kg\s*of\s*CO2-e',
        r'Estimated\s*impact\s*(?P<footprint_with_error>[0-9]*)\s*kgCO2e(q)?\s*\+\/\-\s*(?P<tolerance>[0-9]*)\s*kg\s*(of)?\s*CO2.?e',
        # Assumptions table.
        r'Lifetime of product\s*(?P<lifetime>[0-9]*) years',
        r'Use location\s*(?P<use_location>(Europe|North America|China|WW|Worldwide))',
        r'Use energy demand \(Yearly TEC\)\s*(?P<energy_demand>[0-9]*\.[0-9]*)\s*kWh',
        r'Product weight\s*(?P<weight>[0-9]*\.?\s*[0-9]*)\s*kg',
        r'Screen size\s*(?P<screen_size>[0-9]*\.?\s*[0-9]*)\s*inches',
        r'Final manufacturing location\s*(?P<assembly_location>(Europe|North America|China|WW|Worldwide))\s+',
        # Report year.
        r'Copyright \s*(?P<date>[0-9]{4}) ',
        # Share of each life-cycle phase, in percent.
        r'Use\s*(?P<gwp_use_ratio>[0-9]*\.?[0-9]*)%',
        r'Manufacturing\s*(?P<gwp_manufacturing_ratio>[0-9]*\.?[0-9]*)%',
        r'End (O|o)f Life\s*(?P<gwp_eol_ratio>[0-9]*\.?[0-9]*)%',
        r'Distribution\s*(?P<gwp_transport_ratio>[0-9]*\.?[0-9]*)%',
    )
)
# Maps a fragment of the product name to its (category, subcategory) pair.
# The parser walks the entries in insertion order and stops at the first
# fragment contained in the name, so the more specific 'Mobile Workstation'
# has to stay ahead of 'Workstation'.
_CATEGORIES = {
    name_fragment: ('Workplace', subcategory)
    for name_fragment, subcategory in (
        ('Monitor', 'Monitor'),
        ('Desk', 'Workstation'),
        ('Mobile Workstation', 'Laptop'),
        ('Workstation', 'Workstation'),
        ('Tower', 'Workstation'),
        ('All-in-One', 'Workstation'),
        ('aptop', 'Laptop'),
        ('ook', 'Laptop'),
        ('Tablet', 'Tablet'),
    )
}
# Fallback patterns, applied to the small text box around a label when the
# full-text patterns did not yield the field. For most fields the first
# pattern looks for the value after the label and the second one for the
# value before it.
# NOTE(review): these are sets, so the order in which the patterns of one
# group are tried is not fixed; confirm that text.search_all_patterns does
# not depend on it when both patterns match.
_WEIGHT_PATTERNS = set(map(re.compile, (
    r'eight[^0-9]*(?P<weight>[0-9]*\.?\s*[0-9]*)',
    r'(?P<weight>[0-9]*\.?\s*[0-9]*)\s*(w|W)eight',
)))
_LIFETIME_PATTERNS = set(map(re.compile, (
    r'Lifetime[^0-9]*(?P<lifetime>[0-9]*\.?\s*[0-9]*)',
    r'(?P<lifetime>[0-9])\s*Lifetime',
)))
_ENERGY_PATTERNS = set(map(re.compile, (
    r'energy demand\s*(?P<energy_demand>[0-9]*\.?\s*[0-9]*)',
    r'(?P<energy_demand>[0-9]*\.?\s*[0-9]*)[^0-9]*nergy demand',
)))
_SCREEN_PATTERNS = set(map(re.compile, (
    r'creen size[^0-9]*(?P<screen_size>[0-9]*\.?\s*[0-9]*)[^0-9]',
    r'(?P<screen_size>[0-9]*\.?\s*[0-9]*)[^0-9]*creen',
)))
_USE_LOCATION_PATTERNS = set(map(re.compile, (
    r'Use location\s*(?P<use_location>[A-Za-z\s]*)',
    r'(?P<use_location>[A-Za-z ]*)\s*Use location\s*',
)))
_MANUF_LOCATION_PATTERNS = set(map(re.compile, (
    r'nufacturing location\s*(?P<assembly_location>(Europe|North America|China|WW|Worldwide))',
    r'(?P<assembly_location>^[A-Za-z ]*)\s*(M|m)anufacturing location',
)))
# MD5 digest of an embedded image that is never sent to the pie-chart analyser.
# NOTE(review): presumably a recurring non-chart image (e.g. a logo) -- confirm.
_SKIPPED_IMAGE_MD5 = 'aa44d95aad83a5871bd7974cafd63a06'

# Electricity emission factors (kgCO2e/kWh) against which the reported
# lifetime is cross-checked.
_EXPECTED_ELEC_FACTORS = (0.686, 0.525)


def _to_float(raw: Optional[str]) -> Optional[float]:
    """Convert a regex capture to a float, or return None if it is not a number.

    Whitespace inside the capture is dropped first because the patterns allow
    a gap after the decimal point (e.g. '1. 5'). The digit groups may also
    match nothing, in which case the capture is empty and None is returned
    instead of raising ValueError.
    """
    if raw is None:
        return None
    try:
        return float(''.join(raw.split()))
    except ValueError:
        return None


def _captures_near_label(body: BinaryIO, label: str, patterns: Iterable[Any],
                         group: str, extra_width: float) -> Iterator[str]:
    """Yield what `group` captured in the text box around each occurrence of `label`."""
    for block, page in pdf.search_text(body, label):
        # Widen the label's box to the right so that the value printed next
        # to the label is part of the extracted text.
        box = (block.x0, block.y0 - 2, block.x1 + extra_width, block.y1 + 2)
        found = text.search_all_patterns(patterns, page.get_textbox(box))
        if group in found:
            yield found[group]


def _resolve_field(extracted: Dict[str, Any], body: BinaryIO, group: str, label: str,
                   patterns: Iterable[Any], extra_width: float, numeric: bool) -> Any:
    """Return the value of `group` from the full-text match, else from the text near `label`.

    Numeric fields are converted with `_to_float`; captures that are not
    numbers are skipped. Text fields are returned exactly as captured.
    Returns None when no usable value is found.
    """
    if group in extracted:
        value = _to_float(extracted[group]) if numeric else extracted[group]
        if value is not None:
            return value
    for raw in _captures_near_label(body, label, patterns, group, extra_width):
        value = _to_float(raw) if numeric else raw
        if value is not None:
            return value
    return None


def _read_pie_chart(body: BinaryIO, unpie: Any) -> Any:
    """Return the pie-chart analysis with the most entries found in the PDF (may be empty)."""
    pie_data: Dict[str, Any] = {}
    for image in pdf.list_images(body):
        if hashlib.md5(image).hexdigest() == _SKIPPED_IMAGE_MD5:
            continue
        candidate = unpie.analyze(image, ocrprofile='HP')
        if candidate and len(candidate.keys()) > len(pie_data.keys()):
            pie_data = candidate
        if 'use' in pie_data:
            break
    if not pie_data:
        # No embedded image could be analysed: render the first page and
        # analyse its bottom half instead.
        image = pdf.pdf2img(body, 0)
        rows = image.shape[0]
        bottom_half = image[int(rows / 2):, :, :].copy()
        pie_data = unpie.analyze(bottom_half, ocrprofile='HP')
    return pie_data


def _apply_use_phase_fixes(result: Any) -> None:
    """Correct `lifetime` and `gwp_total` in place when the implied electricity factor is off.

    Nothing is changed when one of the required fields is missing or when the
    lifetime energy consumption is zero.
    """
    required = ('gwp_use_ratio', 'yearly_tec', 'lifetime', 'gwp_total')
    if any(key not in result for key in required):
        return
    use_impact = result['gwp_use_ratio'] * result['gwp_total']
    yearly_tec = result['yearly_tec']
    if not result['lifetime'] * yearly_tec:
        # Avoid a ZeroDivisionError: float division by zero raises in Python
        # instead of returning inf.
        return
    # Compute the electricity factor assuming 100% of the 'use' phase comes
    # from the electricity consumption.
    elec_factor = use_impact / (result['lifetime'] * yearly_tec)
    if not math.isfinite(elec_factor):
        return

    # Fix #1: for some items, the reported lifetime does not match the one
    # used for the computation, leading to an odd electricity factor. Those
    # can be identified by computing what the lifetime would have been for an
    # expected factor (e.g. 0.686kgCO2e/kWh) and checking that this number is
    # roughly an integer.
    for expected_factor in _EXPECTED_ELEC_FACTORS:
        expected_lifetime = elec_factor * result['lifetime'] / expected_factor
        corrected_lifetime = round(expected_lifetime)
        if ((elec_factor < 0.52 or elec_factor > 0.695)
                and abs(expected_lifetime - result['lifetime']) > 0.6
                and abs(expected_lifetime - corrected_lifetime) < 0.1
                # A lifetime rounded down to 0 would make the factor below undefined.
                and corrected_lifetime >= 1):
            result['lifetime'] = corrected_lifetime
            result['comment'] = ' '.join([result['comment'], "fixed lifetime"])
            elec_factor = use_impact / (result['lifetime'] * yearly_tec)
            break

    # Problem #2: for many items, there is an odd discrepancy between the
    # reported CO2e values (mean & percentiles) and the mean gwp_total obtained
    # by summing up the values of the bar plots. For those items, we can also
    # observe that the percentages of the pie chart match the percentages of
    # the bar plot. Moreover, the 'use' phase of the bar plot can also be
    # properly recovered from "yearly_tec * lifetime * 0.525kgCO2e/kWh".
    # -> Those observations suggest that the bar plots are correct, but that
    #    the global mean (and percentiles) are not.
    # -> It is important to correct this error as it also impacts the
    #    embodied values.
    # Those items can be found from the estimated electricity factor, which is
    # around 0.42 for those erroneous files.
    if 0.34 < elec_factor < 0.46:
        correction_factor = 0.525 / elec_factor
        result['gwp_total'] *= correction_factor
        result['comment'] = ' '.join([result['comment'], "fixed gwp_total"])


def parse(body: BinaryIO, pdf_filename: str) -> Iterator[data.DeviceCarbonFootprint]:
    """Parse one HP product carbon footprint PDF.

    Args:
        body: the PDF content, handed to the `pdf` helpers for text, image
            and page extraction.
        pdf_filename: name of the file, only used in log messages.

    Yields:
        A single `data.DeviceCarbonFootprint`; nothing at all when none of the
        HP patterns matched the text of the PDF.
    """
    result = data.DeviceCarbonFootprintData()
    result['comment'] = ''

    # Parse text from PDF.
    pdf_as_text = pdf.pdf2txt(body)
    extracted = text.search_all_patterns(_HP_DESK_PATTERNS, pdf_as_text)
    if not extracted:
        logging.error('The file "%s" did not match the HP pattern', pdf_filename)
        return

    # Convert each matched group to our format.
    if 'name' in extracted:
        result['name'] = extracted['name'].strip()
        for keyword, category_and_sub in _CATEGORIES.items():
            if keyword in result['name']:
                result['category'], result['subcategory'] = category_and_sub
                break
        result['name'] = result['name'].replace("HP ", "")
    else:
        logging.error('The file "%s" did not match the HP pattern (no name extracted)', pdf_filename)
    if 'category' not in result:
        # No keyword matched the name: guess from the presence of a screen size.
        result['category'] = "Workplace"
        result['subcategory'] = "Monitor" if 'screen_size' in extracted else "Workstation"

    if 'footprint_with_error' in extracted and 'tolerance' in extracted:
        total = _to_float(extracted['footprint_with_error'])
        tolerance = _to_float(extracted['tolerance'])
        if total is not None:
            result['gwp_total'] = total
            # A zero total would make the relative error undefined.
            if tolerance is not None and total:
                result['gwp_error_ratio'] = round(tolerance / total, 4)
    if 'footprint' in extracted:
        footprint = _to_float(extracted['footprint'])
        if footprint is not None:
            result['gwp_total'] = footprint
    if 'date' in extracted:
        result['report_date'] = extracted['date']

    # (result key, capture group, label searched in the PDF, fallback
    #  patterns, extra box width, numeric?)
    fallback_fields = (
        ('weight', 'weight', 'weight', _WEIGHT_PATTERNS, 150, True),
        ('screen_size', 'screen_size', 'screen size', _SCREEN_PATTERNS, 150, True),
        ('assembly_location', 'assembly_location', 'manufacturing location',
         _MANUF_LOCATION_PATTERNS, 160, False),
        ('lifetime', 'lifetime', 'lifetime of pro', _LIFETIME_PATTERNS, 150, True),
        ('use_location', 'use_location', 'use location', _USE_LOCATION_PATTERNS, 160, False),
        ('yearly_tec', 'energy_demand', 'energy demand', _ENERGY_PATTERNS, 150, True),
    )
    for field, group, label, patterns, extra_width, numeric in fallback_fields:
        value = _resolve_field(extracted, body, group, label, patterns, extra_width, numeric)
        if value is not None:
            result[field] = value

    # Life-cycle shares are reported in percent; store them as ratios.
    for ratio_key in ('gwp_manufacturing_ratio', 'gwp_use_ratio',
                      'gwp_eol_ratio', 'gwp_transport_ratio'):
        if ratio_key in extracted:
            percent = _to_float(extracted[ratio_key])
            if percent is not None:
                result[ratio_key] = percent / 100

    now = datetime.datetime.now()
    result['added_date'] = now.strftime('%Y-%m-%d')
    result['add_method'] = "HP Auto Parser"
    result['manufacturer'] = "HP"

    if 'gwp_use_ratio' not in extracted:
        # The shares were not found in the text: read them from the pie chart.
        unpie = piechart_analyser.PiechartAnalyzer(debug=0)
        pie_data = _read_pie_chart(body, unpie)
        if pie_data:
            if 'prod' not in pie_data:
                pie_data = unpie.auto_prod(pie_data)
            result = unpie.append_to_boavizta(result, pie_data)

    # Apply some automatic fixes.
    _apply_use_phase_fixes(result)

    yield data.DeviceCarbonFootprint(result)
# Convenient way to run this scraper as a standalone.
if __name__ == '__main__':
    # Hand the parser to the shared loader entry point.
    # NOTE(review): loader.main presumably handles the command line and the
    # input files -- confirm in tools.parsers.lib.loader.
    loader.main(parse)