Skip to content

Commit

Permalink
Limit guessed maxzoom to avoid spending too many tiles on polygon fill (
Browse files Browse the repository at this point in the history
#23)

* Change sqlite3 schema to deduplicate identical tiles

* Limit guessed maxzoom to avoid spending too many tiles on polygon fill

* Fix test.

These dust polygons now have their area calculated because their
maxzoom is being guessed, so the attributes from the largest one
rather than the last one are preserved.

* Increase polygon limit to a million tiles

* Two million tiles ought to be enough for anyone, right?

* Add explanatory comments for mysterious numbers
  • Loading branch information
e-n-f authored Nov 7, 2022
1 parent 1584543 commit 622084a
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 28 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
## 2.10.1
## 2.11.0

* Change sqlite3 schema to deduplicate identical tiles
* Limit guessed maxzoom to avoid spending too many tiles on polygon fill

## 2.10.0

* Upgrade flatbuffers version

Expand Down
49 changes: 43 additions & 6 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ void *run_sort(void *v) {
return NULL;
}

void do_read_parallel(char *map, long long len, long long initial_offset, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > *layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, bool want_dist, bool filters) {
void do_read_parallel(char *map, long long len, long long initial_offset, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > *layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, double *area_sum, bool want_dist, bool filters) {
long long segs[CPUS + 1];
segs[0] = 0;
segs[CPUS] = len;
Expand All @@ -419,13 +419,15 @@ void do_read_parallel(char *map, long long len, long long initial_offset, const

double dist_sums[CPUS];
size_t dist_counts[CPUS];
double area_sums[CPUS];

std::atomic<long long> layer_seq[CPUS];
for (size_t i = 0; i < CPUS; i++) {
// To preserve feature ordering, unique id for each segment
// begins with that segment's offset into the input
layer_seq[i] = segs[i] + initial_offset;
dist_sums[i] = dist_counts[i] = 0;
area_sums[i] = 0;
}

std::vector<parse_json_args> pja;
Expand All @@ -451,6 +453,7 @@ void do_read_parallel(char *map, long long len, long long initial_offset, const
sst[i].initial_x = &initial_x[i];
sst[i].initial_y = &initial_y[i];
sst[i].dist_sum = &(dist_sums[i]);
sst[i].area_sum = &(area_sums[i]);
sst[i].dist_count = &(dist_counts[i]);
sst[i].want_dist = want_dist;
sst[i].maxzoom = maxzoom;
Expand Down Expand Up @@ -486,6 +489,7 @@ void do_read_parallel(char *map, long long len, long long initial_offset, const

*dist_sum += dist_sums[i];
*dist_count += dist_counts[i];
*area_sum += area_sums[i];

json_end_map(pja[i].jp);
}
Expand Down Expand Up @@ -604,6 +608,7 @@ struct read_parallel_arg {
std::map<std::string, int> const *attribute_types = NULL;
double *dist_sum = NULL;
size_t *dist_count = NULL;
double *area_sum = NULL;
bool want_dist = false;
bool filters = false;
};
Expand All @@ -627,7 +632,7 @@ void *run_read_parallel(void *v) {
}
madvise(map, rpa->len, MADV_RANDOM); // sequential, but from several pointers at once

do_read_parallel(map, rpa->len, rpa->offset, rpa->reading, rpa->readers, rpa->progress_seq, rpa->exclude, rpa->include, rpa->exclude_all, rpa->basezoom, rpa->source, rpa->layermaps, rpa->initialized, rpa->initial_x, rpa->initial_y, rpa->maxzoom, rpa->layername, rpa->uses_gamma, rpa->attribute_types, rpa->separator, rpa->dist_sum, rpa->dist_count, rpa->want_dist, rpa->filters);
do_read_parallel(map, rpa->len, rpa->offset, rpa->reading, rpa->readers, rpa->progress_seq, rpa->exclude, rpa->include, rpa->exclude_all, rpa->basezoom, rpa->source, rpa->layermaps, rpa->initialized, rpa->initial_x, rpa->initial_y, rpa->maxzoom, rpa->layername, rpa->uses_gamma, rpa->attribute_types, rpa->separator, rpa->dist_sum, rpa->dist_count, rpa->area_sum, rpa->want_dist, rpa->filters);

madvise(map, rpa->len, MADV_DONTNEED);
if (munmap(map, rpa->len) != 0) {
Expand All @@ -644,7 +649,7 @@ void *run_read_parallel(void *v) {
return NULL;
}

void start_parsing(int fd, STREAM *fp, long long offset, long long len, std::atomic<int> *is_parsing, pthread_t *parallel_parser, bool &parser_created, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > &layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, bool want_dist, bool filters) {
void start_parsing(int fd, STREAM *fp, long long offset, long long len, std::atomic<int> *is_parsing, pthread_t *parallel_parser, bool &parser_created, const char *reading, std::vector<struct reader> *readers, std::atomic<long long> *progress_seq, std::set<std::string> *exclude, std::set<std::string> *include, int exclude_all, int basezoom, int source, std::vector<std::map<std::string, layermap_entry> > &layermaps, int *initialized, unsigned *initial_x, unsigned *initial_y, int maxzoom, std::string layername, bool uses_gamma, std::map<std::string, int> const *attribute_types, int separator, double *dist_sum, size_t *dist_count, double *area_sum, bool want_dist, bool filters) {
// This has to kick off an intermediate thread to start the parser threads,
// so the main thread can get back to reading the next input stage while
// the intermediate thread waits for the completion of the parser threads.
Expand Down Expand Up @@ -682,6 +687,7 @@ void start_parsing(int fd, STREAM *fp, long long offset, long long len, std::ato
rpa->attribute_types = attribute_types;
rpa->dist_sum = dist_sum;
rpa->dist_count = dist_count;
rpa->area_sum = area_sum;
rpa->want_dist = want_dist;
rpa->filters = filters;

Expand Down Expand Up @@ -1330,6 +1336,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
long overall_offset = 0;
double dist_sum = 0;
size_t dist_count = 0;
double area_sum = 0;

int files_open_before_reading = open("/dev/null", O_RDONLY | O_CLOEXEC);
if (files_open_before_reading < 0) {
Expand Down Expand Up @@ -1383,13 +1390,15 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
std::atomic<long long> layer_seq[CPUS];
double dist_sums[CPUS];
size_t dist_counts[CPUS];
double area_sums[CPUS];
std::vector<struct serialization_state> sst;
sst.resize(CPUS);

for (size_t i = 0; i < CPUS; i++) {
layer_seq[i] = overall_offset;
dist_sums[i] = 0;
dist_counts[i] = 0;
area_sums[i] = 0;

sst[i].fname = reading.c_str();
sst[i].line = 0;
Expand All @@ -1402,6 +1411,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
sst[i].initialized = &initialized[i];
sst[i].dist_sum = &dist_sums[i];
sst[i].dist_count = &dist_counts[i];
sst[i].area_sum = &area_sums[i];
sst[i].want_dist = guess_maxzoom;
sst[i].maxzoom = maxzoom;
sst[i].filters = prefilter != NULL || postfilter != NULL;
Expand All @@ -1419,6 +1429,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
// Fold the per-thread running totals into the overall totals.
for (size_t i = 0; i < CPUS; i++) {
	dist_sum += dist_sums[i];
	dist_count += dist_counts[i];
	// Bug fix: was `area_sum = area_sums[i];`, which discarded every
	// thread's polygon area except the last one. Accumulate with +=,
	// like the other totals here and like the identical loop in the
	// other parallel-read path, so the guessed-maxzoom tile budget is
	// computed from the full input area.
	area_sum += area_sums[i];
}

if (munmap(map, st.st_size) != 0) {
Expand Down Expand Up @@ -1452,13 +1463,15 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
std::atomic<long long> layer_seq[CPUS];
double dist_sums[CPUS];
size_t dist_counts[CPUS];
double area_sums[CPUS];
std::vector<struct serialization_state> sst;
sst.resize(CPUS);

for (size_t i = 0; i < CPUS; i++) {
layer_seq[i] = overall_offset;
dist_sums[i] = 0;
dist_counts[i] = 0;
area_sums[i] = 0;

sst[i].fname = reading.c_str();
sst[i].line = 0;
Expand All @@ -1471,6 +1484,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
sst[i].initialized = &initialized[i];
sst[i].dist_sum = &dist_sums[i];
sst[i].dist_count = &dist_counts[i];
sst[i].area_sum = &area_sums[i];
sst[i].want_dist = guess_maxzoom;
sst[i].maxzoom = maxzoom;
sst[i].filters = prefilter != NULL || postfilter != NULL;
Expand All @@ -1488,6 +1502,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
for (size_t i = 0; i < CPUS; i++) {
dist_sum += dist_sums[i];
dist_count += dist_counts[i];
area_sum += area_sums[i];
}

if (munmap(map, st.st_size) != 0) {
Expand All @@ -1508,6 +1523,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
std::atomic<long long> layer_seq[CPUS];
double dist_sums[CPUS];
size_t dist_counts[CPUS];
double area_sums[CPUS];

std::vector<struct serialization_state> sst;
sst.resize(CPUS);
Expand All @@ -1517,6 +1533,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
layer_seq[i] = overall_offset;
dist_sums[i] = 0;
dist_counts[i] = 0;
area_sums[i] = 0;

sst[i].fname = reading.c_str();
sst[i].line = 0;
Expand All @@ -1529,6 +1546,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
sst[i].initialized = &initialized[i];
sst[i].dist_sum = &dist_sums[i];
sst[i].dist_count = &dist_counts[i];
sst[i].area_sum = &area_sums[i];
sst[i].want_dist = guess_maxzoom;
sst[i].maxzoom = maxzoom;
sst[i].filters = prefilter != NULL || postfilter != NULL;
Expand Down Expand Up @@ -1590,7 +1608,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
}

if (map != NULL && map != MAP_FAILED && read_parallel_this) {
do_read_parallel(map, st.st_size - off, overall_offset, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, &layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, uses_gamma, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
do_read_parallel(map, st.st_size - off, overall_offset, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, &layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, uses_gamma, attribute_types, read_parallel_this, &dist_sum, &dist_count, &area_sum, guess_maxzoom, prefilter != NULL || postfilter != NULL);
overall_offset += st.st_size - off;
checkdisk(&readers);

Expand Down Expand Up @@ -1668,7 +1686,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
}

fflush(readfp);
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, &area_sum, guess_maxzoom, prefilter != NULL || postfilter != NULL);

initial_offset += ahead;
overall_offset += ahead;
Expand Down Expand Up @@ -1705,7 +1723,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
fflush(readfp);

if (ahead > 0) {
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, guess_maxzoom, prefilter != NULL || postfilter != NULL);
start_parsing(readfd, streamfpopen(readfp), initial_offset, ahead, &is_parsing, &parallel_parser, parser_created, reading.c_str(), &readers, &progress_seq, exclude, include, exclude_all, basezoom, layer, layermaps, initialized, initial_x, initial_y, maxzoom, sources[layer].layer, gamma != 0, attribute_types, read_parallel_this, &dist_sum, &dist_count, &area_sum, guess_maxzoom, prefilter != NULL || postfilter != NULL);

if (parser_created) {
if (pthread_join(parallel_parser, NULL) != 0) {
Expand Down Expand Up @@ -1735,6 +1753,7 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
sst.initialized = &initialized[0];
sst.dist_sum = &dist_sum;
sst.dist_count = &dist_count;
sst.area_sum = &area_sum;
sst.want_dist = guess_maxzoom;
sst.maxzoom = maxzoom;
sst.filters = prefilter != NULL || postfilter != NULL;
Expand Down Expand Up @@ -2163,6 +2182,24 @@ int read_input(std::vector<source> &sources, char *fname, int maxzoom, int minzo
}
}

// Estimate how many tiles the guessed maxzoom would generate from the
// total area of the input polygons, and lower the maxzoom if necessary
// to keep polygon fill from producing an unreasonable number of tiles.
double total_tile_count = 0;
for (int i = 1; i <= maxzoom; i++) {
	// World coordinates are 32 bits, so a tile at zoom `i` spans
	// 2^(32-i) units on a side; dividing the accumulated polygon
	// area by one tile's area estimates the tiles touched at that zoom.
	double tile_count = ceil(area_sum / ((1LL << (32 - i)) * (1LL << (32 - i))));
	total_tile_count += tile_count;

	// 2M tiles is an arbitrary limit, chosen to make tiling jobs
	// that seem like they should finish in a few minutes
	// actually finish in a few minutes. It is large enough to
	// tile a polygon that covers the entire world to z10
	// or the United States to z13.

	if (total_tile_count > 2 * 1024 * 1024) {
		// Report on stderr, consistent with the other zoom-adjustment
		// diagnostics (e.g. the minimum-maxzoom message below), so
		// stdout is left clean for tile output.
		fprintf(stderr, "Limiting maxzoom to -z%d to keep from generating %lld tiles\n", i - 1, (long long) total_tile_count);
		maxzoom = i - 1;
		break;
	}
}

if (maxzoom < minimum_maxzoom) {
if (!quiet) {
fprintf(stderr, "Using minimum maxzoom of -z%d\n", minimum_maxzoom);
Expand Down
77 changes: 65 additions & 12 deletions mbtiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,45 @@ sqlite3 *mbtiles_open(char *dbname, char **argv, int forcetable) {
exit(EXIT_EXISTS);
}
}
if (sqlite3_exec(outdb, "CREATE TABLE tiles (zoom_level integer, tile_column integer, tile_row integer, tile_data blob);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create tiles table: %s\n", argv[0], err);
if (sqlite3_exec(outdb, "create unique index name on metadata (name);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: index metadata: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
}
if (sqlite3_exec(outdb, "create unique index name on metadata (name);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: index metadata: %s\n", argv[0], err);

// "map" maps z/x/y coordinates to a content hash
if (sqlite3_exec(outdb, "CREATE TABLE map (zoom_level INTEGER, tile_column INTEGER, tile_row INTEGER, tile_id TEXT);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create map table: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
}
if (sqlite3_exec(outdb, "CREATE UNIQUE INDEX map_index ON map (zoom_level, tile_column, tile_row);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create map index: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
}

// "images" maps a content hash to tile contents
if (sqlite3_exec(outdb, "CREATE TABLE images (tile_data blob, tile_id text);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create images table: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
}
if (sqlite3_exec(outdb, "create unique index tile_index on tiles (zoom_level, tile_column, tile_row);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: index tiles: %s\n", argv[0], err);
if (sqlite3_exec(outdb, "CREATE UNIQUE INDEX images_id ON images (tile_id);", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create images index: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
}

// "tiles" is a view that retrieves content from "images"
// via the content hash looked up from "map".
if (sqlite3_exec(outdb, "CREATE VIEW tiles AS SELECT map.zoom_level AS zoom_level, map.tile_column AS tile_column, map.tile_row AS tile_row, images.tile_data AS tile_data FROM map JOIN images ON images.tile_id = map.tile_id;", NULL, NULL, &err) != SQLITE_OK) {
fprintf(stderr, "%s: create tiles view: %s\n", argv[0], err);
if (!forcetable) {
exit(EXIT_EXISTS);
}
Expand All @@ -75,23 +100,51 @@ sqlite3 *mbtiles_open(char *dbname, char **argv, int forcetable) {
}

// Write one tile to the mbtiles database, deduplicating identical tile
// contents: the blob is stored once in "images", keyed by a content hash,
// and the z/x/y coordinate is stored in "map" pointing at that hash.
// Errors during insert are reported but not fatal; statement-preparation
// failures are fatal.
void mbtiles_write_tile(sqlite3 *outdb, int z, int tx, int ty, const char *data, int size) {
	// Store tiles by a hash of their contents. node-mbtiles uses MD5,
	// but I am resisting adding the dependency, so instead here is
	// everybody's first hash function. It is the same as Java's String.hashCode(),
	// https://docs.oracle.com/javase/6/docs/api/java/lang/String.html#hashCode()
	unsigned long long h = 0;
	for (int i = 0; i < size; i++) {
		// Cast to unsigned char: plain char's signedness is
		// implementation-defined, and without the cast the hash (and so
		// the tile_id values written to the file) would differ between
		// platforms for bytes >= 0x80.
		h = h * 31 + (unsigned char) data[i];
	}
	std::string hash = std::to_string(h);

	// following https://github.com/mapbox/node-mbtiles/blob/master/lib/mbtiles.js

	sqlite3_stmt *stmt;
	const char *images = "replace into images (tile_id, tile_data) values (?, ?)";
	if (sqlite3_prepare_v2(outdb, images, -1, &stmt, NULL) != SQLITE_OK) {
		fprintf(stderr, "sqlite3 images prep failed\n");
		exit(EXIT_SQLITE);
	}

	// NULL destructor (SQLITE_STATIC) is safe here: `hash` and `data`
	// both outlive the statement, which is finalized before returning.
	sqlite3_bind_blob(stmt, 1, hash.c_str(), hash.size(), NULL);
	sqlite3_bind_blob(stmt, 2, data, size, NULL);

	if (sqlite3_step(stmt) != SQLITE_DONE) {
		fprintf(stderr, "sqlite3 images insert failed: %s\n", sqlite3_errmsg(outdb));
	}
	if (sqlite3_finalize(stmt) != SQLITE_OK) {
		fprintf(stderr, "sqlite3 images finalize failed: %s\n", sqlite3_errmsg(outdb));
	}

	const char *map = "insert into map (zoom_level, tile_column, tile_row, tile_id) values (?, ?, ?, ?)";
	if (sqlite3_prepare_v2(outdb, map, -1, &stmt, NULL) != SQLITE_OK) {
		fprintf(stderr, "sqlite3 map prep failed\n");
		exit(EXIT_SQLITE);
	}

	sqlite3_bind_int(stmt, 1, z);
	sqlite3_bind_int(stmt, 2, tx);
	// MBTiles uses TMS tile numbering, so flip the Y coordinate.
	sqlite3_bind_int(stmt, 3, (1 << z) - 1 - ty);
	sqlite3_bind_blob(stmt, 4, hash.c_str(), hash.size(), NULL);

	if (sqlite3_step(stmt) != SQLITE_DONE) {
		fprintf(stderr, "sqlite3 map insert failed: %s\n", sqlite3_errmsg(outdb));
	}
	if (sqlite3_finalize(stmt) != SQLITE_OK) {
		fprintf(stderr, "sqlite3 map finalize failed: %s\n", sqlite3_errmsg(outdb));
	}
}

Expand Down
6 changes: 5 additions & 1 deletion serial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ int serialize_feature(struct serialization_state *sst, serial_feature &sf) {
}

double extent = 0;
if (additional[A_DROP_SMALLEST_AS_NEEDED] || additional[A_COALESCE_SMALLEST_AS_NEEDED] || order_by_size) {
if (additional[A_DROP_SMALLEST_AS_NEEDED] || additional[A_COALESCE_SMALLEST_AS_NEEDED] || order_by_size || sst->want_dist) {
if (sf.t == VT_POLYGON) {
for (size_t i = 0; i < scaled_geometry.size(); i++) {
if (scaled_geometry[i].op == VT_MOVETO) {
Expand Down Expand Up @@ -576,6 +576,10 @@ int serialize_feature(struct serialization_state *sst, serial_feature &sf) {
sf.extent = LLONG_MAX;
}

if (sst->want_dist && sf.t == VT_POLYGON) {
*(sst->area_sum) += extent;
}

if (!prevent[P_INPUT_ORDER]) {
sf.seq = 0;
}
Expand Down
1 change: 1 addition & 0 deletions serial.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ struct serialization_state {

double *dist_sum = NULL; // running tally for calculation of resolution within features
size_t *dist_count = NULL;
double *area_sum = NULL;
bool want_dist = false;

int maxzoom = 0;
Expand Down
Loading

0 comments on commit 622084a

Please sign in to comment.