Skip to content

Commit 4fd02b4

Browse files
authored
Fix loading issues with dehydrated load (#121)
* Fix issue with csv field size limit. * Remove csv usage, fix loading quoted JSON: this fixes a failure to load JSON with quoted values in the projection information. The added test fails without these changes. The loader moves away from using the csv package and instead splits on tabs, while guarding against the case where the Item content contains tab characters. * Add missing test files.
1 parent d3997cd commit 4fd02b4

File tree

5 files changed

+211
-3
lines changed

5 files changed

+211
-3
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
2+
3+
## [Unreleased]
4+
5+
### Fixed
6+
7+
- Fix failure of pypgstac load for large items [#121](https://github.com/stac-utils/pgstac/pull/121)
8+
29
## [v0.6.4]
10+
311
### Fixed
412
- Fixed casts for numeric data when a property is not in the queryables table to use the type from the incoming json filter
513
- Fixed an issue with the loader grouping an unordered iterable by partition, speeding up loads of items with mixed partitions [#116](https://github.com/stac-utils/pgstac/pull/116)

pypgstac/pypgstac/load.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
Generator,
2121
TextIO,
2222
)
23-
import csv
2423
import orjson
2524
import psycopg
2625
from orjson import JSONDecodeError
@@ -512,6 +511,8 @@ def read_dehydrated(self, file: Union[Path, str] = "stdin") -> Generator:
512511
if isinstance(file, str):
513512
open_file: Any = open_std(file, "r")
514513
with open_file as f:
514+
# Note: if 'content' is changed to be anything
515+
# but the last field, the logic below will break.
515516
fields = [
516517
"id",
517518
"geometry",
@@ -520,8 +521,21 @@ def read_dehydrated(self, file: Union[Path, str] = "stdin") -> Generator:
520521
"end_datetime",
521522
"content",
522523
]
523-
csvreader = csv.DictReader(f, fields, delimiter="\t")
524-
for item in csvreader:
524+
525+
for line in f:
526+
tab_split = line.split("\t")
527+
item = {}
528+
for i, field in enumerate(fields):
529+
if field == "content":
530+
# Join the remaining splits in case
531+
# there were any tabs in the JSON content.
532+
content_value = "\t".join(tab_split[i:])
533+
# Replace quote characters that can be
534+
# written on export and causes failures.
535+
content_value = content_value.replace(r'\\"', r"\"")
536+
item[field] = content_value
537+
else:
538+
item[field] = tab_split[i]
525539
item["partition"] = self._partition_update(item)
526540
yield item
527541

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
{
2+
"id": "chloris-biomass",
3+
"type": "Collection",
4+
"links": [
5+
{
6+
"rel": "items",
7+
"type": "application/geo+json",
8+
"href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/chloris-biomass/items"
9+
},
10+
{
11+
"rel": "parent",
12+
"type": "application/json",
13+
"href": "https://planetarycomputer.microsoft.com/api/stac/v1/"
14+
},
15+
{
16+
"rel": "root",
17+
"type": "application/json",
18+
"href": "https://planetarycomputer.microsoft.com/api/stac/v1/"
19+
},
20+
{
21+
"rel": "self",
22+
"type": "application/json",
23+
"href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/chloris-biomass"
24+
},
25+
{
26+
"rel": "license",
27+
"href": "https://spdx.org/licenses/CC-BY-NC-SA-4.0.html",
28+
"title": "Creative Commons Attribution Non Commercial Share Alike 4.0 International"
29+
},
30+
{
31+
"rel": "describedby",
32+
"href": "https://planetarycomputer.microsoft.com/dataset/chloris-biomass",
33+
"title": "Human readable dataset overview and reference",
34+
"type": "text/html"
35+
}
36+
],
37+
"title": "Chloris Biomass",
38+
"assets": {
39+
"thumbnail": {
40+
"href": "https://ai4edatasetspublicassets.blob.core.windows.net/assets/pc_thumbnails/chloris-biomass.jpg",
41+
"type": "image/jpg",
42+
"roles": [
43+
"thumbnail"
44+
],
45+
"title": "Chloris Biomass"
46+
}
47+
},
48+
"extent": {
49+
"spatial": {
50+
"bbox": [
51+
[
52+
-179.95,
53+
-60,
54+
179.95,
55+
90
56+
]
57+
]
58+
},
59+
"temporal": {
60+
"interval": [
61+
[
62+
"2003-07-31T00:00:00Z",
63+
"2019-07-31T00:00:00Z"
64+
]
65+
]
66+
}
67+
},
68+
"license": "CC-BY-NC-SA-4.0",
69+
"keywords": [
70+
"Chloris",
71+
"Biomass",
72+
"MODIS",
73+
"Carbon"
74+
],
75+
"providers": [
76+
{
77+
"url": "http://chloris.earth/",
78+
"name": "Chloris",
79+
"roles": [
80+
"producer",
81+
"licensor"
82+
]
83+
},
84+
{
85+
"url": "https://planetarycomputer.microsoft.com",
86+
"name": "Microsoft",
87+
"roles": [
88+
"host",
89+
"processor"
90+
]
91+
}
92+
],
93+
"summaries": {
94+
"gsd": [
95+
4633
96+
]
97+
},
98+
"description": "The Chloris Global Biomass 2003 - 2019 dataset provides estimates of stock and change in aboveground biomass for Earth's terrestrial woody vegetation ecosystems. It covers the period 2003 - 2019, at annual time steps. The global dataset has a circa 4.6 km spatial resolution.\n\nThe maps and data sets were generated by combining multiple remote sensing measurements from space borne satellites, processed using state-of-the-art machine learning and statistical methods, validated with field data from multiple countries. The dataset provides direct estimates of aboveground stock and change, and are not based on land use or land cover area change, and as such they include gains and losses of carbon stock in all types of woody vegetation - whether natural or plantations.\n\nAnnual stocks are expressed in units of tons of biomass. Annual changes in stocks are expressed in units of CO2 equivalent, i.e., the amount of CO2 released from or taken up by terrestrial ecosystems for that specific pixel.\n\nThe spatial data sets are available on [Microsoft’s Planetary Computer](https://planetarycomputer.microsoft.com/dataset/chloris-biomass) under a Creative Common license of the type Attribution-Non Commercial-Share Alike [CC BY-NC-SA](https://spdx.org/licenses/CC-BY-NC-SA-4.0.html).\n\n[Chloris Geospatial](https://chloris.earth/) is a mission-driven technology company that develops software and data products on the state of natural capital for use by business, governments, and the social sector.\n",
99+
"item_assets": {
100+
"biomass": {
101+
"type": "image/tiff; application=geotiff; profile=cloud-optimized",
102+
"roles": [
103+
"data"
104+
],
105+
"title": "Annual estimates of aboveground woody biomass.",
106+
"raster:bands": [
107+
{
108+
"unit": "tonnes",
109+
"nodata": 2147483647,
110+
"data_type": "uint32"
111+
}
112+
]
113+
},
114+
"biomass_wm": {
115+
"type": "image/tiff; application=geotiff; profile=cloud-optimized",
116+
"roles": [
117+
"data"
118+
],
119+
"title": "Annual estimates of aboveground woody biomass (Web Mercator).",
120+
"raster:bands": [
121+
{
122+
"unit": "tonnes",
123+
"nodata": 2147483647,
124+
"data_type": "uint32"
125+
}
126+
]
127+
},
128+
"biomass_change": {
129+
"type": "image/tiff; application=geotiff; profile=cloud-optimized",
130+
"roles": [
131+
"data"
132+
],
133+
"title": "Annual estimates of changes (gains and losses) in aboveground woody biomass from the previous year.",
134+
"raster:bands": [
135+
{
136+
"unit": "tonnes",
137+
"nodata": -32768,
138+
"data_type": "int16"
139+
}
140+
]
141+
},
142+
"biomass_change_wm": {
143+
"type": "image/tiff; application=geotiff; profile=cloud-optimized",
144+
"roles": [
145+
"data"
146+
],
147+
"title": "Annual estimates of changes (gains and losses) in aboveground woody biomass from the previous year (Web Mercator).",
148+
"raster:bands": [
149+
{
150+
"unit": "tonnes",
151+
"nodata": -32768,
152+
"data_type": "int16"
153+
}
154+
]
155+
}
156+
},
157+
"stac_version": "1.0.0",
158+
"msft:container": "chloris-biomass",
159+
"stac_extensions": [
160+
"https://stac-extensions.github.io/projection/v1.0.0/schema.json",
161+
"https://stac-extensions.github.io/raster/v1.1.0/schema.json"
162+
],
163+
"msft:storage_account": "ai4edataeuwest",
164+
"msft:short_description": "The Chloris Global Biomass 2003 - 2019 dataset provides estimates of stock and change in aboveground biomass for Earth's terrestrial woody vegetation ecosystems during the period 2003 - 2019, at annual time steps. The global dataset has a circa 4.6 km spatial resolution."
165+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
chloris_biomass_50km_2017 0103000020E6100000010000000500000066666666667E66C0000000000080564066666666667E66C00000000000004EC066666666667E66400000000000004EC066666666667E6640000000000080564066666666667E66C00000000000805640 chloris-biomass 2016-07-31 00:00:00+00 2017-07-31 00:00:00+00 {"bbox": [-179.95, -60.0, 179.95, 90.0], "links": [], "assets": {"biomass": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_2017.tif", "roles": "𒍟※", "file:size": 20568072}, "biomass_wm": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_2017_merc.tif", "roles": "𒍟※", "file:size": 44219262, "proj:bbox": [-20037508.337301563, -8400523.19027713, 20034408.51494577, 149088735.41090125], "proj:shape": [33992, 8649], "proj:transform": [4633.12716525001, 0.0, -20037508.337301563, 0.0, -4633.127165250011, 149088735.41090125]}, "biomass_change": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_change_2016-2017.tif", "roles": "𒍟※", "file:size": 11101605}, "biomass_change_wm": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_change_2016-2017_merc.tif", "roles": "𒍟※", "file:size": 25997181, "proj:bbox": [-20037508.337301563, -8400523.19027713, 20034408.51494577, 149088735.41090125], "proj:shape": [33992, 8649], "proj:transform": [4633.12716525001, 0.0, -20037508.337301563, 0.0, -4633.127165250011, 149088735.41090125]}}, "properties": {"gsd": 4633, "datetime": "2017-01-01T00:00:00Z", "proj:bbox": [-20015109.354, -6671703.11790004, 20015109.35376009, 10007554.677], "proj:epsg": null, "proj:wkt2": "PROJCS[\\"unnamed\\",GEOGCS[\\"unnamed 
ellipse\\",DATUM[\\"unknown\\",SPHEROID[\\"unnamed\\",6371007.181,0]],PRIMEM[\\"Greenwich\\",0],UNIT[\\"degree\\",0.0174532925199433,AUTHORITY[\\"EPSG\\",\\"9122\\"]]],PROJECTION[\\"Sinusoidal\\"],PARAMETER[\\"longitude_of_center\\",0],PARAMETER[\\"false_easting\\",0],PARAMETER[\\"false_northing\\",0],UNIT[\\"metre\\",1,AUTHORITY[\\"EPSG\\",\\"9001\\"]],AXIS[\\"Easting\\",EAST],AXIS[\\"Northing\\",NORTH]]", "proj:shape": [3600, 8640], "end_datetime": "2017-07-31T00:00:00Z", "proj:transform": [4633.12716525001, 0.0, -20015109.354, 0.0, -4633.127165250011, 10007554.677], "start_datetime": "2016-07-31T00:00:00Z"}, "stac_extensions": ["https://stac-extensions.github.io/file/v2.0.0/schema.json", "https://stac-extensions.github.io/raster/v1.0.0/schema.json", "https://stac-extensions.github.io/projection/v1.0.0/schema.json"]}
2+
chloris_biomass_50km_2018 0103000020E6100000010000000500000066666666667E66C0000000000080564066666666667E66C00000000000004EC066666666667E66400000000000004EC066666666667E6640000000000080564066666666667E66C00000000000805640 chloris-biomass 2017-07-31 00:00:00+00 2018-07-31 00:00:00+00 {"bbox": [-179.95, -60.0, 179.95, 90.0], "links": [], "assets": {"biomass": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_2018.tif", "roles": "𒍟※", "file:size": 20581566}, "biomass_wm": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_2018_merc.tif", "roles": "𒍟※", "file:size": 44264008, "proj:bbox": [-20037508.337301563, -8400523.19027713, 20034408.51494577, 149088735.41090125], "proj:shape": [33992, 8649], "proj:transform": [4633.12716525001, 0.0, -20037508.337301563, 0.0, -4633.127165250011, 149088735.41090125]}, "biomass_change": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_change_2017-2018.tif", "roles": "𒍟※", "file:size": 11278226}, "biomass_change_wm": {"href": "https://ai4edataeuwest.blob.core.windows.net/chloris-biomass/cog/bio_change_2017-2018_merc.tif", "roles": "𒍟※", "file:size": 26334351, "proj:bbox": [-20037508.337301563, -8400523.19027713, 20034408.51494577, 149088735.41090125], "proj:shape": [33992, 8649], "proj:transform": [4633.12716525001, 0.0, -20037508.337301563, 0.0, -4633.127165250011, 149088735.41090125]}}, "properties": {"gsd": 4633, "datetime": "2018-01-01T00:00:00Z", "proj:bbox": [-20015109.354, -6671703.11790004, 20015109.35376009, 10007554.677], "proj:epsg": null, "proj:wkt2": "PROJCS[\\"unnamed\\",GEOGCS[\\"unnamed 
ellipse\\",DATUM[\\"unknown\\",SPHEROID[\\"unnamed\\",6371007.181,0]],PRIMEM[\\"Greenwich\\",0],UNIT[\\"degree\\",0.0174532925199433,AUTHORITY[\\"EPSG\\",\\"9122\\"]]],PROJECTION[\\"Sinusoidal\\"],PARAMETER[\\"longitude_of_center\\",0],PARAMETER[\\"false_easting\\",0],PARAMETER[\\"false_northing\\",0],UNIT[\\"metre\\",1,AUTHORITY[\\"EPSG\\",\\"9001\\"]],AXIS[\\"Easting\\",EAST],AXIS[\\"Northing\\",NORTH]]", "proj:shape": [3600, 8640], "end_datetime": "2018-07-31T00:00:00Z", "proj:transform": [4633.12716525001, 0.0, -20015109.354, 0.0, -4633.127165250011, 10007554.677], "start_datetime": "2017-07-31T00:00:00Z"}, "stac_extensions": ["https://stac-extensions.github.io/file/v2.0.0/schema.json", "https://stac-extensions.github.io/raster/v1.0.0/schema.json", "https://stac-extensions.github.io/projection/v1.0.0/schema.json"]}

pypgstac/tests/test_load.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,22 @@ def test_s1_grd_load_and_query(loader: Loader) -> None:
320320
)[0]
321321
item = res["features"][0]
322322
pystac.Item.from_dict(item).validate()
323+
324+
325+
def test_load_dehydrated(loader: Loader) -> None:
326+
"""Test loader for items dumped directly out of item table."""
327+
collections = [
328+
HERE / "data-files" / "hydration" / "collections" / "chloris-biomass.json",
329+
]
330+
331+
for collection in collections:
332+
loader.load_collections(
333+
str(collection),
334+
insert_mode=Methods.ignore,
335+
)
336+
337+
dehydrated_items = HERE / "data-files" / "load" / "dehydrated.txt"
338+
339+
loader.load_items(
340+
str(dehydrated_items), insert_mode=Methods.insert, dehydrated=True
341+
)

0 commit comments

Comments
 (0)