Skip to content
60 changes: 60 additions & 0 deletions tests/gen_data/init_defog_sf.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1370,3 +1370,63 @@ INSERT INTO writes (aid, pid) VALUES
(2, 4),
(2, 5),
(3, 5);

-------------------------------------------------------------------------------
-- For the Restaurants SCHEMA
-- https://github.com/defog-ai/defog-data/blob/main/defog_data/restaurants/restaurants.sql

-------------------------------------------------------------------------------

-- City lookup: each row names a city together with its county and region.
-- No keys or constraints are declared; city_name is the column that the
-- location and restaurant tables also carry.
CREATE TABLE geographic (
    city_name VARCHAR,
    county    VARCHAR,
    region    VARCHAR
);

-- Street address of a restaurant. restaurant_id carries the same values as
-- restaurant.id, and city_name the same values as geographic.city_name;
-- neither link is enforced by a declared constraint.
CREATE TABLE location (
    restaurant_id BIGINT,
    house_number  BIGINT,
    street_name   VARCHAR,
    city_name     VARCHAR
);

-- Restaurant master data: identifier, display name, cuisine, city and rating.
-- No primary key is declared on id.
CREATE TABLE restaurant (
    id        BIGINT,
    name      VARCHAR,
    food_type VARCHAR,
    city_name VARCHAR,
    rating    FLOAT
);

-- Seed the city lookup with five cities.
-- 'Chicago' is the only city that has no rows in location or restaurant.
INSERT INTO geographic
    (city_name, county, region)
VALUES
    ('Los Angeles', 'Los Angeles', 'California'),
    ('New York', 'New York', 'New York'),
    ('San Francisco', 'San Francisco', 'California'),
    ('Miami', 'Miami-Dade', 'Florida'),
    ('Chicago', 'Cook', 'Illinois');

-- Seed one address for each restaurant id 1 through 11.
-- House number 123 and street 'Pine Ave' each occur more than once, but no
-- (house_number, street_name, city_name) combination is repeated.
INSERT INTO location
    (restaurant_id, house_number, street_name, city_name)
VALUES
    -- Los Angeles
    (1, 123, 'Main St', 'Los Angeles'),
    (2, 456, 'Maple Ave', 'Los Angeles'),
    (3, 789, 'Oak St', 'Los Angeles'),
    -- New York
    (4, 321, 'Elm St', 'New York'),
    (5, 654, 'Pine Ave', 'New York'),
    (6, 123, 'Pine Ave', 'New York'),
    -- San Francisco
    (7, 12, 'Market St', 'San Francisco'),
    (8, 34, 'Mission St', 'San Francisco'),
    (9, 56, 'Valencia St', 'San Francisco'),
    -- Miami
    (10, 78, 'Ocean Dr', 'Miami'),
    (11, 90, 'Biscayne Rd', 'Miami');

-- Seed eleven restaurants. Columns are listed in table-definition order.
-- Ids 10 and 11 share the same name, food type and city; they differ only in
-- id and rating.
INSERT INTO restaurant
    (id, name, food_type, city_name, rating)
VALUES
    (1, 'The Pasta House', 'Italian', 'Los Angeles', 4.5),
    (2, 'The Burger Joint', 'American', 'Los Angeles', 3.8),
    (3, 'The Sushi Bar', 'Japanese', 'Los Angeles', 4.2),
    (4, 'The Pizza Place', 'Italian', 'New York', 4.7),
    (5, 'The Steakhouse', 'American', 'New York', 3.9),
    (6, 'The Ramen Shop', 'Japanese', 'New York', 4.3),
    (7, 'The Tacos & Burritos', 'Mexican', 'San Francisco', 4.1),
    (8, 'The Vegan Cafe', 'Vegan', 'San Francisco', 4.6),
    (9, 'The BBQ Joint', 'American', 'San Francisco', 3.7),
    (10, 'The Seafood Shack', 'Seafood', 'Miami', 4.4),
    (11, 'The Seafood Shack', 'Seafood', 'Miami', 4.6);
56 changes: 56 additions & 0 deletions tests/gen_data/init_defog_sqlite.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1085,3 +1085,59 @@ INSERT INTO writes (aid, pid) VALUES
(2, 4),
(2, 5),
(3, 5);

-------------------------------------------------------------------------------
--RESTAURANTS SCHEMA
-- City lookup: each row names a city together with its county and region.
-- No keys or constraints are declared; city_name is the column that the
-- location and restaurant tables also carry.
CREATE TABLE geographic (
    city_name TEXT,
    county    TEXT,
    region    TEXT
);

-- Street address of a restaurant. restaurant_id carries the same values as
-- restaurant.id, and city_name the same values as geographic.city_name;
-- neither link is enforced by a declared constraint.
CREATE TABLE location (
    restaurant_id INTEGER,
    house_number  INTEGER,
    street_name   TEXT,
    city_name     TEXT
);

-- Restaurant master data: identifier, display name, cuisine, city and rating.
-- No primary key is declared on id.
CREATE TABLE restaurant (
    id        INTEGER,
    name      TEXT,
    food_type TEXT,
    city_name TEXT,
    rating    REAL
);

-- Seed the city lookup with five cities.
-- 'Chicago' is the only city that has no rows in location or restaurant.
INSERT INTO geographic
    (city_name, county, region)
VALUES
    ('Los Angeles', 'Los Angeles', 'California'),
    ('New York', 'New York', 'New York'),
    ('San Francisco', 'San Francisco', 'California'),
    ('Miami', 'Miami-Dade', 'Florida'),
    ('Chicago', 'Cook', 'Illinois');

-- Seed one address for each restaurant id 1 through 11.
-- House number 123 and street 'Pine Ave' each occur more than once, but no
-- (house_number, street_name, city_name) combination is repeated.
INSERT INTO location
    (restaurant_id, house_number, street_name, city_name)
VALUES
    -- Los Angeles
    (1, 123, 'Main St', 'Los Angeles'),
    (2, 456, 'Maple Ave', 'Los Angeles'),
    (3, 789, 'Oak St', 'Los Angeles'),
    -- New York
    (4, 321, 'Elm St', 'New York'),
    (5, 654, 'Pine Ave', 'New York'),
    (6, 123, 'Pine Ave', 'New York'),
    -- San Francisco
    (7, 12, 'Market St', 'San Francisco'),
    (8, 34, 'Mission St', 'San Francisco'),
    (9, 56, 'Valencia St', 'San Francisco'),
    -- Miami
    (10, 78, 'Ocean Dr', 'Miami'),
    (11, 90, 'Biscayne Rd', 'Miami');

-- Seed eleven restaurants. Columns are listed in table-definition order.
-- Ids 10 and 11 share the same name, food type and city; they differ only in
-- id and rating.
INSERT INTO restaurant
    (id, name, food_type, city_name, rating)
VALUES
    (1, 'The Pasta House', 'Italian', 'Los Angeles', 4.5),
    (2, 'The Burger Joint', 'American', 'Los Angeles', 3.8),
    (3, 'The Sushi Bar', 'Japanese', 'Los Angeles', 4.2),
    (4, 'The Pizza Place', 'Italian', 'New York', 4.7),
    (5, 'The Steakhouse', 'American', 'New York', 3.9),
    (6, 'The Ramen Shop', 'Japanese', 'New York', 4.3),
    (7, 'The Tacos & Burritos', 'Mexican', 'San Francisco', 4.1),
    (8, 'The Vegan Cafe', 'Vegan', 'San Francisco', 4.6),
    (9, 'The BBQ Joint', 'American', 'San Francisco', 3.7),
    (10, 'The Seafood Shack', 'Seafood', 'Miami', 4.4),
    (11, 'The Seafood Shack', 'Seafood', 'Miami', 4.6);
227 changes: 227 additions & 0 deletions tests/test_metadata/defog_graphs.json
Original file line number Diff line number Diff line change
Expand Up @@ -4432,5 +4432,232 @@
"synonyms": ["publication's author", "work's author"]
}
]
},
{
"name": "Restaurants",
"version": "V2",
"collections": [
{
"name": "geographies",
"type": "simple table",
"table path": "main.geographic",
"unique properties": [
"city_name"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since each row represents a unique city, I'd advise changing the name of the collection to cities, and altering the description accordingly.

],
"properties": [
{
"name": "city_name",
"type": "table column",
"column name": "city_name",
"data type": "string",
"description": "The name of the city",
"sample values": ["Los Angeles", "Miami"],
"synonyms": ["city"]
},
{
"name": "county",
"type": "table column",
"column name": "county",
"data type": "string",
"description": "The name of the county",
"sample values": ["Miami-Dade", "Cook", "New York", "San Francisco"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These county names are ALSO city names. Let's include more sample values that are unambiguously county names (make sure to update the other graphs): Miami-Dade, Cook

"synonyms": ["division"]
},
{
"name": "region",
"type": "table column",
"column name": "region",
"data type": "string",
"description": "The name of the region",
"sample values": ["California", "New York"],
"synonyms": ["territory"]
}
],
"description": "Contains one record per city, giving the county and region that the city belongs to",
"synonyms": ["cities", "places"]
},
{
"name": "locations",
"type": "simple table",
"table path": "main.location",
"unique properties": [
"restaurant_id",
["house_number", "street_name", "city_name"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't each combination of (house_number, street_name, city_name) also be unique?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean setting the unique properties to ["restaurant_id", ["house_number", "street_name", "city_name"]] or to [["restaurant_id", "house_number", "street_name", "city_name"]]? Will this change affect how the current queries are written?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first one:

["restaurant_id", ["house_number", "street_name", "city_name"]]

And no it won't necessarily change any of the current queries.

],
"properties": [
{
"name": "restaurant_id",
"type": "table column",
"column name": "restaurant_id",
"data type": "numeric",
"description": "Unique identifier for each restaurant",
"sample values": [1, 2, 3],
"synonyms": ["restaurant_id", "diner_id"]
},
{
"name": "house_number",
"type": "table column",
"column name": "house_number",
"data type": "numeric",
"description": "The number assigned to the building where the restaurant is located",
"sample values": [123, 789, 12],
"synonyms": ["street_number"]
},
{
"name": "street_name",
"type": "table column",
"column name": "street_name",
"data type": "string",
"description": "The name of the street where the restaurant is located",
"sample values": ["Main St", "Oak St", "Pine Ave"],
"synonyms": ["avenue"]
},
{
"name": "city_name",
"type": "table column",
"column name": "city_name",
"data type": "string",
"description": "The name of the city where the restaurant is located",
"sample values": ["New York", "Los Angeles", "Miami"],
"synonyms": []
}
],
"description": "Contains the location of each restaurant",
"synonyms": ["address"]
},
{
"name": "restaurants",
"type": "simple table",
"table path": "main.restaurant",
"unique properties": [
"id_"
],
"properties": [
{
"name": "id_",
"type": "table column",
"column name": "id",
"data type": "numeric",
"description": "Unique identifier for each restaurant",
"sample values": [1, 2, 3],
"synonyms": ["identifier"]
},
{
"name": "name",
"type": "table column",
"column name": "name",
"data type": "string",
"description": "The name of the restaurant",
"sample values": ["The Pasta House", "The Burger Joint", "The Seafood Shack"],
"synonyms": ["restaurant"]
},
{
"name": "food_type",
"type": "table column",
"column name": "food_type",
"data type": "string",
"description": "The type of food served at the restaurant",
"sample values": ["Seafood", "American", "Japanese"],
"synonyms": ["specialty", "menu type"]
},
{
"name": "city_name",
"type": "table column",
"column name": "city_name",
"data type": "string",
"description": "The city where the restaurant is located",
"sample values": ["San Francisco", "New York", "Miami"],
"synonyms": ["locality", "town"]
},
{
"name": "rating",
"type": "table column",
"column name": "rating",
"data type": "numeric",
"description": "The rating of the restaurant on a scale of 0 to 5",
"sample values": [4.2, 3.9, 4.5],
"synonyms": ["score", "review"]
}
],
"description": "Contains the information of the restaurants",
"synonyms": ["diner"]
}
],
"relationships": [
{
"type": "simple join",
"name": "locations",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd call this restaurant_locations since it is mapping a city to each restaurant location within the city.

"parent collection": "geographies",
"child collection": "locations",
"singular": false,
"always matches": false,
"keys": {
"city_name": [
"city_name"
]
},
"description": "All restaurant locations within the city",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How I'd describe this: "All restaurant locations within the city"

"synonyms": ["restaurant locations"]
},
{
"type": "reverse",
"name": "geographic",
"original parent": "geographies",
"original property": "locations",
"singular": true,
"always matches": true,
"description": "The geographic information for the city that the location belongs to",
"synonyms": ["city", "geography", "town"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can make this clearer:

  • Call the relationship city
  • Say that this relationship is "The geographic information for the city that the location belongs to"

},
{
"type": "simple join",
"name": "locations",
"parent collection": "restaurants",
"child collection": "locations",
"singular": true,
"always matches": true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect: each restaurant has 1 location, and vice versa. The name should be singular location, both singular and always matches should be true, and the description/synonyms should change.

"keys": {
"id_": [
"restaurant_id"
]
},
"description": "The location record for this restaurant (each restaurant has exactly one)",
"synonyms": ["address", "site"]
},
{
"type": "reverse",
"name": "restaurant",
"original parent": "restaurants",
"original property": "locations",
"singular": true,
"always matches": true,
"description": "The restaurant found at this location",
"synonyms": ["diner", "eatery"]
Comment on lines 4628 to 4634
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, all of these need to be changed.

},
{
"type": "simple join",
"name": "restaurants",
"parent collection": "geographies",
"child collection": "restaurants",
"singular": false,
"always matches": false,
"keys": {
"city_name": [
"city_name"
]
},
"description": "The restaurants located in this geographic territory",
"synonyms": ["diners"]
},
{
"type": "reverse",
"name": "geographic",
"original parent": "geographies",
"original property": "restaurants",
"singular": true,
"always matches": true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

always matches should be true here, since every restaurant has a matching record in geographic.

"description": "The geographic territory related to this restaurant",
"synonyms": ["location", "territory"]
}
]
}
]
Loading