Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
abc5228
adding to gitignore
dayanajoseph3091 Dec 13, 2020
733d731
Parse JSON to dataframe
dayanajoseph3091 Dec 13, 2020
f28b89e
Parse JSON to dataframe
dayanajoseph3091 Dec 15, 2020
6611c9c
SQL Schema
dayanajoseph3091 Dec 15, 2020
60c8749
Merge pull request #1 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 15, 2020
c362e57
remove unused code
dayanajoseph3091 Dec 15, 2020
8f24778
Merge pull request #2 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 15, 2020
fd4f923
tableau_reports
dayanajoseph3091 Dec 17, 2020
984e281
Tableau based modeling and reporting
dayanajoseph3091 Dec 17, 2020
6081887
Merge pull request #3 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 17, 2020
b6f000d
adding end of file instruction
dayanajoseph3091 Dec 17, 2020
5aa1779
Merge pull request #4 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 17, 2020
bdbb2b5
added config_handler.py for reading db configuration file dev.ini
dayanajoseph3091 Dec 18, 2020
c111967
Adding Tableau file t
dayanajoseph3091 Dec 18, 2020
828673a
added config_handler.py for reading db configuration file dev.ini
dayanajoseph3091 Dec 18, 2020
c372cb0
Merge remote-tracking branch 'origin/feature/dayana' into feature/dayana
dayanajoseph3091 Dec 18, 2020
46e5e1a
Merge pull request #5 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 18, 2020
cfcaf23
added json path in config_handler.py for reading json file path from …
dayanajoseph3091 Dec 18, 2020
4e1bec6
Merge pull request #6 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 18, 2020
145e778
Add files via upload
dayanajoseph3091 Dec 18, 2020
3624a7a
addressing review comments
dayanajoseph3091 Dec 21, 2020
d9affe4
Merge pull request #7 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 21, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/img/
4 changes: 4 additions & 0 deletions code/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import load_db as ldb
# Script entry point: call load_db's main() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    # execute only if run as the entry point into the program
    ldb.main()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing newline at end of file.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a newline at the end of the file.

33 changes: 33 additions & 0 deletions code/extract_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
import pandas as pd


# def load_json(data):
# Load JSON
#x=pd.DataFrame()
def load(path="../json/top-rated-movies-02.json"):
    """Read a JSON file of movie records into a pandas DataFrame.

    Args:
        path: Location of the JSON file. Defaults to the feed file this
            function has always read, so existing ``load()`` calls keep
            working unchanged.

    Returns:
        The ``pandas.DataFrame`` built from the parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which would mis-decode non-ASCII titles and names on some systems.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

#print(load(x))
# JSON to DataFrame
def _join_names(value):
    """Collapse a list of names into one lower-cased, comma-separated string.

    List items are joined directly so characters inside a name (for
    example the apostrophe in "Genelia D'Souza") are preserved. Any other
    value is stringified and cleaned of list punctuation and quotes.
    """
    if isinstance(value, (list, tuple)):
        return ",".join(str(item).strip().lower() for item in value)
    text = str(value).lower().strip()
    for char in "[]'\"":
        text = text.replace(char, "")
    return text.replace(", ", ",")


def _duration_to_minutes(value):
    """Convert a duration such as ``PT89M`` or ``PT1H30M`` to whole minutes."""
    import re

    text = str(value).strip()
    match = re.fullmatch(r"PT(?:(\d+)H)?(?:(\d+)M)?", text)
    if match and (match.group(1) or match.group(2)):
        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        return hours * 60 + minutes
    # Anything else (e.g. a bare "89") goes through the plain marker strip;
    # int() raises ValueError when the remainder is not a number.
    return int(text.replace("PT", "").replace("M", ""))


def json_to_df(dataframe=None):
    """Return the movie DataFrame with columns normalised for loading.

    Transformations applied:
      * ``genres`` / ``actors``: list -> lower-cased "a,b,c" string.
      * ``ratings``: converted to its string representation.
      * ``duration``: ISO-8601 style text (``PT89M``) -> integer minutes.
      * ``imdbRating``: converted to float.

    Args:
        dataframe: Optional raw DataFrame to transform. When omitted the
            data is read with ``load()``. A supplied frame is copied, so
            the caller's object is left untouched.

    Returns:
        The transformed ``pandas.DataFrame``.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a duration or rating cannot be converted.
    """
    frame = load() if dataframe is None else dataframe.copy()

    frame['genres'] = frame['genres'].apply(_join_names)
    frame['ratings'] = frame['ratings'].astype('str')
    frame['duration'] = (
        frame['duration'].apply(_duration_to_minutes).astype(int)
    )
    frame['imdbRating'] = frame['imdbRating'].astype('float')
    frame['actors'] = frame['actors'].apply(_join_names)
    return frame
69 changes: 69 additions & 0 deletions code/load_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pyodbc
import teardown as db
import extract_transform as dp


# ODBC settings for the local Movies_DB instance. Raw strings keep the
# backslash in the server name from being read as an escape sequence.
_CONNECTION_STRING = (
    r'DRIVER={ODBC Driver 17 for SQL Server};TrustServerCertificate=No;'
    r'DATABASE=Movies_DB;WSID=LAPTOP-BLDSMT2E;'
    r'APP={Microsoft® Windows® Operating System};Trusted_Connection=Yes;'
    r'SERVER=(localdb)\MSSQLLocalDB;Description=movies'
)

# DDL for the three target tables, executed in this order.
# NOTE(review): imdbRating is varchar here but float in the relationship
# tables, and genres is limited to 50 characters - confirm both are intended.
_CREATE_TABLE_STATEMENTS = (
    ('\n'
     '\n'
     ' CREATE TABLE [Top_rated_Movie] (\n'
     ' Id int NOT NULL CONSTRAINT [Id] PRIMARY KEY,\n '
     'title char (100) NULL ,\n'
     ' [year] [date] NULL ,\n'
     '[genres] [nvarchar] (50) NULL ,\n'
     '[duration] [int] NULL ,\n'
     '[releaseDate] [nvarchar] (50) NULL ,\n'
     '[actors] [nvarchar] (500) NULL ,\n'
     '[imdbRating] [varchar] (50) NULL ,\n'
     ') ON [PRIMARY]\n'
     '\n'
     ' '),
    ('\n'
     '\n'
     ' CREATE TABLE [Movie_Actor_Relationship] (\n'
     ' [movieID] [int] NOT NULL , '
     ' [actor] [nvarchar] (500) NULL ,\n'
     ' [imdbRating] [float] (50) NULL ,\n'
     ') \n'
     '\n'
     ' '),
    ('\n'
     '\n'
     ' CREATE TABLE [Movie_Genre_Relationship] (\n'
     ' [movieID] [int] NOT NULL , '
     ' [genre] [nvarchar] (50) NULL ,\n'
     ' [imdbRating] [float] (50) NULL ,\n'
     ') \n'
     '\n'
     ' '),
)

_INSERT_MOVIE = (
    "INSERT INTO dbo.Top_rated_Movie"
    "(Id,title,year,genres,actors,duration,releaseDate,imdbRating)"
    " values (?,?,?,?,?,?,?,?)"
)
_INSERT_ACTOR = (
    "INSERT INTO dbo.Movie_Actor_Relationship(movieID,actor,imdbRating)"
    " values (?,?,?)"
)
_INSERT_GENRE = (
    "INSERT INTO dbo.Movie_Genre_Relationship(movieID,genre,imdbRating)"
    " values (?,?,?)"
)


def _build_insert_rows(dataframe):
    """Turn the transformed DataFrame into parameter rows for each table.

    Returns a ``(movies, actors, genres)`` tuple of lists. The actor and
    genre lists hold one row per comma-separated entry of the movie's
    ``actors`` / ``genres`` string, keyed by the DataFrame index.
    """
    movies, actors, genres = [], [], []
    for index, row in dataframe.iterrows():
        rating = row['imdbRating']
        movies.append((
            index, row['title'], row['year'], row['genres'], row['actors'],
            row['duration'], row['releaseDate'], rating,
        ))
        actors.extend(
            (index, actor, rating) for actor in row['actors'].split(",")
        )
        genres.extend(
            (index, genre, rating) for genre in row['genres'].split(",")
        )
    return movies, actors, genres


def main():
    """Rebuild the movie tables and load them from the JSON feed.

    Steps: drop any existing tables, extract and transform the JSON data,
    create the three tables, then insert one row per movie plus one row
    per (movie, actor) and (movie, genre) pair.

    Raises:
        pyodbc.Error: If connecting or any SQL statement fails. The
            connection is closed in that case and uncommitted inserts
            are not persisted.
    """
    # For rerun purposes (in case of a new daily feed).
    db.dbcleanup()

    # Extract the JSON and transform it before touching the database.
    dataframe = dp.json_to_df()
    movies, actors, genres = _build_insert_rows(dataframe)

    conn = pyodbc.connect(_CONNECTION_STRING)
    try:
        cursor = conn.cursor()

        for statement in _CREATE_TABLE_STATEMENTS:
            cursor.execute(statement)
        conn.commit()

        # One batch per statement so each INSERT is prepared once instead
        # of being re-prepared on every alternation inside a row loop.
        batches = (
            (_INSERT_MOVIE, movies),
            (_INSERT_ACTOR, actors),
            (_INSERT_GENRE, genres),
        )
        for sql, rows in batches:
            # executemany() rejects an empty parameter sequence.
            if rows:
                cursor.executemany(sql, rows)

        conn.commit()
        cursor.close()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()
20 changes: 20 additions & 0 deletions code/teardown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
import json

import pyodbc
from dask.dataframe.methods import values

import pyodbc


def dbcleanup():
    """Drop the three movie tables, if present, so a load can be rerun.

    Connects to the local Movies_DB instance, issues ``DROP TABLE IF
    EXISTS`` for each table, commits, and closes the connection.

    Raises:
        pyodbc.Error: If connecting or a DROP statement fails. The
            connection is still closed in that case.
    """
    # Raw strings keep the backslash in the server name from being read as
    # an escape sequence; the resulting connection string is unchanged.
    conn = pyodbc.connect(
        r'DRIVER={ODBC Driver 17 for SQL Server};TrustServerCertificate=No;'
        r'DATABASE=Movies_DB;WSID=LAPTOP-BLDSMT2E;'
        r'APP={Microsoft® Windows® Operating System};Trusted_Connection=Yes;'
        r'SERVER=(localdb)\MSSQLLocalDB;Description=movies')
    drop_statements = (
        "DROP TABLE IF EXISTS [dbo].[Top_rated_Movie]",
        "DROP TABLE IF EXISTS [dbo].[Movie_Actor_Relationship]",
        "DROP TABLE IF EXISTS [dbo].[Movie_Genre_Relationship]",
    )
    try:
        cursor = conn.cursor()
        for statement in drop_statements:
            cursor.execute(statement)
        cursor.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
Loading