-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathimport-overture-extract.sh
More file actions
executable file
·359 lines (324 loc) · 11.2 KB
/
import-overture-extract.sh
File metadata and controls
executable file
·359 lines (324 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
#!/bin/bash
# Bail out immediately unless the duckdb CLI is available on PATH;
# everything below depends on it.
command -v duckdb > /dev/null 2>&1 || {
  echo "duckdb not installed. Please install it first."
  echo "To install duckdb, you can use:"
  echo " curl https://install.duckdb.org/ | sh"
  echo "or follow the instructions at https://duckdb.org/docs/installation/."
  echo
  echo "Be sure to add it to your path afterwards."
  exit 1
}
# Name of the DuckDB database file this script produces.
db_filename="overture-maps.duckdb"

# Pull an optional "--log <path>" flag out of the argument list before the
# positional bounding-box arguments are consumed; everything else is kept
# in order.
log_file=""
positional=()
while (( $# > 0 )); do
  if [[ "$1" == "--log" ]]; then
    log_file="$2"
    shift 2
  else
    positional+=("$1")
    shift
  fi
done
set -- "${positional[@]}"

# When logging is requested, mirror all stdout/stderr into the log file.
if [[ -n "$log_file" ]]; then
  mkdir -p "$(dirname "$log_file")"
  exec > >(tee "$log_file") 2>&1
fi
# Positional arguments: WGS84 bounding box plus an optional release tag.
xmin=$1
ymin=$2
xmax=$3
ymax=$4
release=$5

if [ -z "$xmin" ] || [ -z "$ymin" ] || [ -z "$xmax" ] || [ -z "$ymax" ]; then
  echo
  echo "Usage: $0 [--log <path>] <xmin> <ymin> <xmax> <ymax> [release]"
  echo " release: Optional Overture release date (format: YYYY-MM-DD.N)"
  echo
  exit 1
fi

# Each coordinate must be a plain decimal number (optional sign and fraction).
if ! [[ "$xmin" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || ! [[ "$ymin" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || \
   ! [[ "$xmax" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || ! [[ "$ymax" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
  echo "All coordinates must be valid numbers."
  exit 1
fi

# Check that xmin is numerically less than xmax and ymin less than ymax.
# awk does the floating-point comparison: unlike bc (used previously), awk is
# guaranteed by POSIX and needs no separate install check.
if ! awk -v a="$xmin" -v b="$xmax" -v c="$ymin" -v d="$ymax" \
    'BEGIN { exit !(a < b && c < d) }'; then
  echo "Invalid bounding box: xmin must be less than xmax and ymin must be less than ymax."
  exit 1
fi
# Use the release given on the command line, otherwise discover the latest one.
if [ -n "$release" ]; then
  latest_release=$release
  echo "Using specified Overture release: $latest_release"
else
  # Auto-discover the latest Overture release from the public S3 listing.
  echo "Auto-discovering latest Overture release..."
  # Releases are named YYYY-MM-DD.N. Sort the date part lexically (reverse)
  # and the trailing build number numerically (reverse), so e.g.
  # 2024-01-01.10 correctly beats 2024-01-01.2 — a plain reverse lexical
  # sort would pick .2 over .10.
  latest_release=$(curl -s "https://overturemaps-us-west-2.s3.amazonaws.com/?list-type=2&prefix=release/&delimiter=/" |
    grep -o '<Prefix>release/[0-9][^<]*/</Prefix>' |
    sed 's/<Prefix>release\/\(.*\)\/<\/Prefix>/\1/' |
    sort -t. -k1,1r -k2,2nr |
    head -1)
  if [ -z "$latest_release" ]; then
    echo "No Overture releases found on S3."
    exit 1
  fi
  echo "Using latest Overture release: $latest_release"
fi
# Resolve the directory holding this script; all output lives in a db/
# directory one level above it, with downloaded parquet files cached per
# release underneath.
script_dir=$(dirname "$(realpath "$0")")
output_dir="${script_dir}/../db"
cache_dir="${output_dir}/cache/overture/${latest_release}"
mkdir -p "$output_dir" "$cache_dir"
# Ask the S3 listing API for every parquet part file of the places theme in
# this release; strip the XML tags and sort for a stable order.
echo "Finding available place parquet files..."
source_base="https://overturemaps-us-west-2.s3.amazonaws.com"
places_prefix="release/${latest_release}/theme=places/type=place/"
parquet_files=$(
  curl -s "${source_base}/?list-type=2&prefix=${places_prefix}" |
    grep -o ">[^<]*part-[0-9]*-[^<]*.parquet<" |
    sed 's/>\(.*\)</\1/g' |
    sort
)
if [ -z "$parquet_files" ]; then
  echo "No parquet files found. Please check the release date and URL format."
  echo "Showing sample of XML response:"
  # Dump a slice of the raw XML so the user can diagnose the listing format.
  curl -s "${source_base}/?list-type=2&prefix=${places_prefix}" | head -50
  exit 1
fi
# Count how many of the listed parquet files are already in the local cache.
# Note: the cache path must use ${filename}; the original had a garbled
# "$(unknown)" here, which made the existence check always test a bogus path.
file_count=$(echo "$parquet_files" | wc -l | tr -d ' ')
cached_count=0
while IFS= read -r file; do
  filename=$(basename "$file")
  if [ -f "${cache_dir}/${filename}" ]; then
    cached_count=$((cached_count + 1))
  fi
done <<< "$parquet_files"

# Download any files not cached yet. Each download lands in a .tmp path and
# is renamed into place only on success, so an interrupted transfer never
# leaves a truncated parquet file in the cache.
dl_count=0
while IFS= read -r file; do
  filename=$(basename "$file")
  dest="${cache_dir}/${filename}"
  if [ -f "$dest" ]; then
    continue
  fi
  dl_count=$((dl_count + 1))
  echo "Downloading ${dl_count} / $((file_count - cached_count)) (cached: ${cached_count}): ${filename}"
  if ! curl -sf -o "${dest}.tmp" "${source_base}/${file}"; then
    echo "Failed to download ${source_base}/${file}"
    rm -f "${dest}.tmp"
    exit 1
  fi
  mv "${dest}.tmp" "$dest"
done <<< "$parquet_files"
# Remove any existing temp file so a previous failed run cannot interfere.
rm -f "${output_dir}/${db_filename}.tmp"
# Take the first cached file to initialize the structure
# ("limit 0" below copies the parquet schema without importing any rows).
first_file="${cache_dir}/$(basename "$(echo "$parquet_files" | head -1)")"
# Initialize the spatial extension and the places table.
# NOTE(review): memory_limit=48GB assumes a large-memory host — confirm.
cat > "${output_dir}/import-overture.sql" <<EOF
.print "Initializing..."
SET memory_limit='48GB';
install spatial;
load spatial;
.headers off
.mode list
-- Create the places table
create table places as select * from '${first_file}' limit 0;
EOF
# Load the data from each parquet file into the places table.
# Each iteration appends an INSERT that keeps only rows whose bbox lies fully
# inside [xmin,xmax] x [ymin,ymax] and whose geometry is non-NULL. The
# unquoted EOF lets the shell expand ${xmin} etc. into the generated SQL;
# all four values were validated as numeric earlier, so this is safe.
file_number=0
while IFS= read -r file; do
file_number=$((file_number + 1))
local_file="${cache_dir}/$(basename "$file")"
cat <<EOF
SELECT printf('[%s] Importing file ${file_number}/${file_count}: $(basename "$file")', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
insert into places select * from '${local_file}'
where bbox.xmin >= ${xmin}
and bbox.xmax <= ${xmax}
and bbox.ymin >= ${ymin}
and bbox.ymax <= ${ymax}
and geometry is not null;
EOF
done <<< "$parquet_files" >> "${output_dir}/import-overture.sql"
# Create spatial index
# NOTE(review): despite the comment and log message, this builds a plain
# index on the id column, not a spatial (R-tree) index — confirm intent.
cat >> "${output_dir}/import-overture.sql" <<EOF
SELECT printf('[%s] Creating spatial index...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
create index idx_id on places(id);
EOF
# Compute importance as normalized 0-100 integer score
# 60% window-function density (S2 level 12 cell count) + 40% category IDF
#
# Steps in the generated SQL:
#  - t_idf: per-category inverse document frequency, ln(total/category count).
#  - place_density: ln(1 + n) where n is the number of places sharing the
#    same S2 level-12 cell (cell derived from the bbox center point).
#  - places_scored: final 0-100 score; density is capped at ln-score 10 and
#    IDF at 18 before weighting.
# NOTE(review): the /10.0 and /18.0 normalization caps look tuned by hand —
# confirm they still match the data distribution.
cat >> "${output_dir}/import-overture.sql" <<EOF
SELECT printf('[%s] Loading geography extension for importance scoring...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
install geography from community;
load geography;
SELECT printf('[%s] Computing importance scores...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
CREATE TEMP TABLE t_idf AS
SELECT
categories.primary AS category,
count(*) AS n_places,
ln(N.total::DOUBLE / count(*)::DOUBLE) AS idf_score
FROM places
CROSS JOIN (
SELECT count(*) AS total FROM places
WHERE categories.primary IS NOT NULL
) N
WHERE categories.primary IS NOT NULL
GROUP BY categories.primary, N.total;
CREATE TEMP TABLE place_density AS
SELECT id,
ln(1 + count(*) OVER (
PARTITION BY s2_cell_parent(
s2_cellfromlonlat(
(bbox.xmin + bbox.xmax) / 2.0,
(bbox.ymin + bbox.ymax) / 2.0
), 12)
)) AS density_score
FROM places;
CREATE TEMP TABLE place_idf AS
SELECT
p.id,
coalesce(idf.idf_score, 0) AS idf_score
FROM places p
LEFT JOIN t_idf idf ON idf.category = p.categories.primary;
CREATE TABLE places_scored AS
SELECT p.*,
round(
60 * least(coalesce(d.density_score, 0) / 10.0, 1.0)
+ 40 * least(coalesce(i.idf_score, 0) / 18.0, 1.0)
)::INTEGER AS importance
FROM places p
LEFT JOIN place_density d USING (id)
LEFT JOIN place_idf i USING (id);
DROP TABLE places;
ALTER TABLE places_scored RENAME TO places;
create index idx_id on places(id);
DROP TABLE place_density;
DROP TABLE place_idf;
DROP TABLE t_idf;
EOF
# Extract name variants
# Flattens the Overture names struct into a per-place "variants" list:
#  - names.common map entries become 'alternate' variants keyed by language;
#  - names.rules entries keep their variant type ('official'/'short'/
#    'alternate'), with unknown types mapped to 'alternate'.
# The places table is then rebuilt with a variants column (empty list when a
# place has none). The quoted 'EOF' delimiter keeps the SQL literal — no
# shell expansion happens inside this heredoc.
cat >> "${output_dir}/import-overture.sql" <<'EOF'
SELECT printf('[%s] Extracting name variants...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
CREATE TEMP TABLE overture_variants AS
WITH common_entries AS (
SELECT id,
unnest.key AS language,
unnest."value" AS name
FROM places,
unnest(map_entries(names.common))
WHERE names.common IS NOT NULL
),
rule_entries AS (
SELECT id,
unnest.language,
unnest."value" AS name,
CASE unnest.variant
WHEN 'common' THEN 'alternate'
WHEN 'official' THEN 'official'
WHEN 'alternate' THEN 'alternate'
WHEN 'short' THEN 'short'
ELSE 'alternate'
END AS type
FROM places,
unnest(names.rules)
WHERE names.rules IS NOT NULL
),
all_variants AS (
SELECT id, name, 'alternate' AS type, language FROM common_entries
UNION ALL
SELECT id, name, type, language FROM rule_entries
)
SELECT id, list({'name': name, 'type': type, 'language': language}
ORDER BY name) AS variants
FROM all_variants
WHERE name IS NOT NULL AND name != ''
GROUP BY id;
CREATE TABLE places_with_variants AS
SELECT p.*,
coalesce(ov.variants, []) AS variants
FROM places p
LEFT JOIN overture_variants ov USING (id);
DROP TABLE places;
ALTER TABLE places_with_variants RENAME TO places;
create index idx_id on places(id);
DROP TABLE overture_variants;
EOF
# Build name_index with trigrams in batches to avoid OOM on large datasets.
# Each batch covers a fixed slice of the places rowid space, so the trigram
# explosion (one row per 3-char window of every normalized name) is bounded
# per INSERT.
# NOTE(review): rows with rowid >= max_rowid + batch_size are silently never
# indexed; the 80M cap assumes extracts stay below ~85M rows — confirm.
cat >> "${output_dir}/import-overture.sql" <<EOF
SELECT printf('[%s] Creating name index (batched)...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
CREATE TABLE name_index (trigram VARCHAR, id VARCHAR, name VARCHAR, norm_name VARCHAR, importance INTEGER, is_variant BOOLEAN DEFAULT FALSE);
EOF
batch_size=5000000
max_rowid=80000000
for (( batch_start = 0; batch_start <= max_rowid; batch_start += batch_size )); do
  batch_end=$((batch_start + batch_size))
  cat >> "${output_dir}/import-overture.sql" <<EOF
SELECT printf('[%s] name_index batch rowid ${batch_start}–${batch_end}...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
INSERT INTO name_index
SELECT
substr(np.norm_name, pos, 3) AS trigram,
np.id,
np.name,
np.norm_name,
np.importance,
FALSE AS is_variant
FROM (
SELECT
id,
names.primary AS name,
lower(strip_accents(names.primary)) AS norm_name,
coalesce(importance, 0) AS importance
FROM places
WHERE names.primary IS NOT NULL
AND length(names.primary) >= 3
AND rowid >= ${batch_start}
AND rowid < ${batch_end}
) np
CROSS JOIN generate_series(1, length(np.norm_name) - 2) AS gs(pos);
EOF
done
# Index variant names into name_index as well, flagged with is_variant=TRUE
# so search can rank primary-name hits above variant hits. Same trigram
# expansion as the batched primary-name pass; names shorter than 3 chars are
# skipped because they produce no trigram.
# NOTE(review): unlike the primary-name pass this is not batched — assumes
# the variant set is small enough to fit in memory in one INSERT.
cat >> "${output_dir}/import-overture.sql" <<'EOF'
SELECT printf('[%s] Indexing variant names...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
INSERT INTO name_index
WITH variant_names AS (
SELECT id,
unnest.name,
lower(strip_accents(unnest.name)) AS norm_name,
importance,
TRUE AS is_variant
FROM places,
unnest(variants)
WHERE unnest.name IS NOT NULL AND length(unnest.name) >= 3
)
SELECT substr(vn.norm_name, pos, 3) AS trigram,
vn.id,
vn.name,
vn.norm_name,
vn.importance,
vn.is_variant
FROM variant_names vn
CROSS JOIN generate_series(1, length(vn.norm_name) - 2) AS gs(pos);
EOF
# Final SQL stanza: physically cluster name_index by trigram (the lower
# memory limit keeps the ORDER BY from ballooning and is restored after),
# then refresh planner statistics. Appended as a single heredoc.
cat >> "${output_dir}/import-overture.sql" <<EOF
SELECT printf('[%s] Sorting name index by trigram...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
SET memory_limit='16GB';
CREATE TABLE name_index_sorted AS SELECT * FROM name_index ORDER BY trigram;
SET memory_limit='48GB';
DROP TABLE name_index;
ALTER TABLE name_index_sorted RENAME TO name_index;
SELECT printf('[%s] Analyzing...', strftime(now(), '%Y-%m-%dT%H:%M:%S'));
analyze;
EOF
# Run the import script against a temporary database file, so a failed
# import never clobbers an existing good database.
echo
time duckdb -bail "${output_dir}/${db_filename}.tmp" -c ".read ${output_dir}/import-overture.sql"
status=$?
if [ $status -ne 0 ]; then
  echo "Failed to import data into DuckDB."
  # Clean up the generated SQL and the partial database (the original left
  # the .tmp database behind on failure).
  rm -f "${output_dir}/import-overture.sql"
  rm -f "${output_dir}/${db_filename}.tmp"
  exit 1
fi
# Atomically replace any existing database with the freshly built one.
mv "${output_dir}/${db_filename}.tmp" "${output_dir}/${db_filename}"
rm -f "${output_dir}/import-overture.sql"
echo
# Sanity check: report how many places were imported.
duckdb "${output_dir}/${db_filename}" -c "select count(*) from places;"