Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions JuliaProject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
GeoDataFrames = "62cb38b5-d8d2-4862-a48e-6a340996859f"
GeoFormatTypes = "68eda718-8dee-11e9-39e7-89f7f65f511f"
GeoInterface = "cf35fbd7-0cd7-5166-be24-54bfbe79505f"
GeoParquet = "e99870d8-ce00-4fdd-aeee-e09192881159"
GeometryOps = "3251bfac-6a57-4b6d-aa61-ac1fef2975ab"
Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
WellKnownGeometry = "0f680547-7be7-4555-8820-bb198eeb646b"
143 changes: 143 additions & 0 deletions geometryops.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import GeometryOps as GO, GeoInterface as GI, GeoFormatTypes as GFT
import GeoParquet, Parquet2, WellKnownGeometry

import Tables, Dates, Statistics
using DataFrames
# function q1(data_paths::Dict{String, String})

data_paths = Dict{String, String}()
for file in ["trip", "building", "customer", "driver", "vehicle", "zone"]
data_paths[file] = joinpath(@__DIR__, "$file.parquet")
end

"""
q1(data_paths::Dict{String, String})::DataFrame

Q1 (GeometryOps): Find trips starting within 50km of Sedona city center.
"""
q1(args...) = q1_naive(args...)

# TODO: this needs a branch-and-bound is within distance algorithm.
function q1_naive(data_paths::Dict{String, String})
trip_pq = Parquet2.Dataset(data_paths["trip"])#[!, ["t_tripkey", "t_pickuploc", "t_pickuptime"]]
trip_df = DataFrame(trip_pq; copycols = false)[!, ["t_tripkey", "t_pickuploc", "t_pickuptime"]]
trip_df.geometry = GO.tuples(GFT.WellKnownBinary.((GFT.Geom(),), trip_df.t_pickuploc); crs = GFT.EPSG(4326))
metadata!(trip_df, GI.GEOINTERFACE_CRS_KEY, GFT.EPSG(4326))
metadata!(trip_df, GI.GEOINTERFACE_GEOMETRYCOLUMNS_KEY, (:geometry,))

trip_df.pickup_lon = GI.x.(trip_df.geometry)
trip_df.pickup_lat = GI.y.(trip_df.geometry)
trip_df.distance_to_center = GO.distance.(trip_df.geometry, ((-111.7610, 34.8697),))

filtered = filter(:distance_to_center => x -> isfinite(x) && x <= 0.45, trip_df)
return sort!(filtered, [:distance_to_center, :t_tripkey])[!, ["t_tripkey", "pickup_lon", "pickup_lat", "t_pickuptime", "distance_to_center"]]
end


"""
q2(data_paths::Dict{String, String})::DataFrame

Q2 (GeometryOps): Count trips starting within Coconino County zone.

Finds the first zone row where z_name == 'Coconino County' and counts trips whose
pickup point intersects that polygon. Returns single-row DataFrame with
trip_count_in_coconino_county.

TODO:
- [ ] Implement tree accelerated point-in-polygon in GeometryOps, preferably with a prepared,
NaturallyIndexedRing implementation
"""
q2(args...) = q2_naive(args...)
# TODO: this needs tree accelerated point-in-polygon - probably requires
# fleshing out the Prepared interface.
function q2_naive(data_paths::Dict{String, String})
trip_pq = Parquet2.Dataset(data_paths["trip"])#[!, ["t_tripkey", "t_pickuploc", "t_pickuptime"]]
trip_df = DataFrame(trip_pq; copycols = false)[!, ["t_tripkey", "t_pickuploc", "t_pickuptime"]]
trip_df.geometry = GO.tuples(GFT.WellKnownBinary.((GFT.Geom(),), trip_df.t_pickuploc); crs = GFT.EPSG(4326))

zone_df = DataFrame(Parquet2.Dataset(data_paths["zone"]); copycols = false)
target_idx = findfirst(==("Coconino County"), zone_df.z_name)
if isnothing(target_idx)
return DataFrame(trip_count_in_coconino_county = [0])
end
target_poly = GFT.WellKnownBinary(GFT.Geom(), zone_df.z_boundary[target_idx]) |> GO.tuples

trip_loc_tree = GO.STRtree(trip_df.geometry)
target_extent = GI.extent(target_poly)

count = 0
GO.SpatialTreeInterface.depth_first_search(GO.intersects(target_extent), trip_loc_tree) do i
if GO.intersects(trip_df.geometry[i], target_poly)
count += 1
end
end

return DataFrame(trip_count_in_coconino_county = [count])
end


"""
q3(data_paths::Dict{String, String})::DataFrame

Q3 (GeometryOps): Monthly trip stats within 15km (10km box + 5km buffer) of Sedona center.

"""
q3(args...) = q3_naive(args...)
# TODO: this needs a branch-and-bound is within distance algorithm.
function q3_naive(data_paths::Dict{String, String})
trip_pq = Parquet2.Dataset(data_paths["trip"])#[!, ["t_tripkey", "t_pickuploc", "t_pickuptime"]]
trip_df = DataFrame(trip_pq; copycols = false)
trip_df.geometry = GO.tuples(GFT.WellKnownBinary.((GFT.Geom(),), trip_df.t_pickuploc); crs = GFT.EPSG(4326))

base_poly = GI.Polygon(
[GI.LinearRing([
(-111.9060, 34.7347),
(-111.6160, 34.7347),
(-111.6160, 35.0047),
(-111.9060, 35.0047),
(-111.9060, 34.7347),
])]
)

trip_df.distance = GO.distance.(trip_df.geometry, (base_poly,))

filtered = filter(:distance => <=(0.045), trip_df)

filtered.duration_seconds = (filtered.t_dropofftime - filtered.t_pickuptime)
filtered.pickup_month = (Dates.month.(filtered.t_pickuptime))

grouped = groupby(filtered, :pickup_month)
agg = combine(grouped, :t_tripkey => length => :total_trips, :distance => Statistics.mean => :avg_distance, :duration_seconds => Statistics.mean => :avg_duration, :t_fare => Statistics.mean => :avg_fare)
sort!(agg, :pickup_month)
return agg
end

"""
q4(data_paths::Dict{String, String})::DataFrame

Q4 (GeometryOps): Zone distribution of top 1000 trips by tip amount.
"""
q4(args...) = q4_naive(args...)

function q4_naive(data_paths::Dict{String, String})
# Algorithm to do coverage like things
# Do a dual query
# Store the result in a vector where the index is of the point and the value
# is the index of the zone
# and have a sentinel value (0) for no zone yet
# if the point has already been assigned a zone, but intersection returns
# valid for another zone, then update the value to the new zone
# IF AND ONLY IF the new zone is less than the current zone
# OR: pseudorandom selection
end

"""
q5(data_paths::Dict{String, String})::DataFrame

Q5 (GeometryOps): Monthly travel patterns for repeat customers (convex hull of dropoff locations)
"""
q5(args...) = q5_naive(args...)

function q5_naive(data_paths::Dict{String, String})

end
Loading