Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
abc5228
adding to gitignore
dayanajoseph3091 Dec 13, 2020
733d731
Parse JSON to dataframe
dayanajoseph3091 Dec 13, 2020
f28b89e
Parse JSON to dataframe
dayanajoseph3091 Dec 15, 2020
6611c9c
SQL Schema
dayanajoseph3091 Dec 15, 2020
60c8749
Merge pull request #1 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 15, 2020
c362e57
remove unused code
dayanajoseph3091 Dec 15, 2020
8f24778
Merge pull request #2 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 15, 2020
fd4f923
tableau_reports
dayanajoseph3091 Dec 17, 2020
984e281
Tableau based modeling and reporting
dayanajoseph3091 Dec 17, 2020
6081887
Merge pull request #3 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 17, 2020
b6f000d
adding end of file instruction
dayanajoseph3091 Dec 17, 2020
5aa1779
Merge pull request #4 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 17, 2020
bdbb2b5
added config_handler.py for reading db configuration file dev.ini
dayanajoseph3091 Dec 18, 2020
c111967
Adding Tableau file t
dayanajoseph3091 Dec 18, 2020
828673a
added config_handler.py for reading db configuration file dev.ini
dayanajoseph3091 Dec 18, 2020
c372cb0
Merge remote-tracking branch 'origin/feature/dayana' into feature/dayana
dayanajoseph3091 Dec 18, 2020
46e5e1a
Merge pull request #5 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 18, 2020
cfcaf23
added json path in config_handler.py for reading json file path from …
dayanajoseph3091 Dec 18, 2020
4e1bec6
Merge pull request #6 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 18, 2020
145e778
Add files via upload
dayanajoseph3091 Dec 18, 2020
3624a7a
addressing review comments
dayanajoseph3091 Dec 21, 2020
d9affe4
Merge pull request #7 from dayanajoseph3091/feature/dayana
dayanajoseph3091 Dec 21, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/img/
Binary file added Insights.pptx
Binary file not shown.
7 changes: 7 additions & 0 deletions code/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import load_db as ldb
import sys

if __name__ == '__main__':
    # Run the load pipeline only when this file is the program entry point,
    # then terminate with a success status.
    ldb.main()
    raise SystemExit(0)
49 changes: 49 additions & 0 deletions code/config_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from configparser import ConfigParser

import pyodbc


def load_config():
    """Read the project configuration file and return the populated parser.

    The file is looked up as ``../dev.ini`` relative to the current working
    directory.

    Returns:
        ConfigParser: parser loaded with the contents of ``../dev.ini``.

    Raises:
        FileNotFoundError: if ``../dev.ini`` could not be read.
    """
    parser = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was loaded.
    # Fail here with a clear message instead of a later NoSectionError.
    if not parser.read('../dev.ini'):
        raise FileNotFoundError(
            "Configuration file '../dev.ini' was not found or is not readable")
    return parser


def get_SqlConfig():
    """Return the SQL Server settings stored in the ``[db]`` config section.

    Returns:
        tuple: ``(driver, database, trusted_connection, server)``, in that
        order, each read as a string from the configuration file.
    """
    config = load_config()
    # The order of the option names fixes the order of the returned tuple.
    option_names = ("driver", "database", "trusted_connection", "server")
    return tuple(config.get("db", option) for option in option_names)


def ms_sql_connection():
    """Open a pyodbc connection to the SQL Server database named in the config.

    The connection string is built from the ``[db]`` settings returned by
    ``get_SqlConfig()``. The configured ``trusted_connection`` value is passed
    to the driver as ``Trusted_Connection`` so that the authentication mode set
    in the configuration file is actually applied.

    Returns:
        pyodbc.Connection: an open connection; the caller must close it.
    """
    driver_, db_name, trusted_, server_ = get_SqlConfig()
    conn_info = ('DRIVER=' + driver_ + ';TrustServerCertificate=No;'
                 'DATABASE=' + db_name + ';SERVER=' + server_ +
                 ';Trusted_Connection=' + trusted_)
    return pyodbc.connect(conn_info)


# Look up the movie JSON file location in the configuration file
def json_path():
    """Return the ``movie_list`` entry of the ``[json]`` config section."""
    return load_config().get("json", "movie_list")

# ---- How to create the .ini file ----

# config = ConfigParser()
# config['db'] = {
# "driver": "{ODBC Driver 17 for SQL Server}",
# "database": "Movies_DB",
# "trusted_Connection": "yes",
# "server": "(localdb)\MSSQLLocalDB"
#
# }
# with open('../dev.ini', 'w') as f:
# config.write(f)
36 changes: 36 additions & 0 deletions code/extract_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import json
import pandas as pd
import config_handler as ch


def load():
    """Load the movie JSON file into a pandas DataFrame.

    The file location comes from ``config_handler.json_path()``.

    Returns:
        pandas.DataFrame: built directly from the parsed JSON content.
    """
    # JSON text is UTF-8; state it explicitly so non-ASCII names are not
    # mis-decoded by the platform default encoding (e.g. cp1252 on Windows).
    with open(ch.json_path(), encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)


# JSON to DataFrame
def json_to_df():
    """Load the movie JSON and normalise its columns for the database load.

    Returns:
        pandas.DataFrame: the frame from ``load()`` with ``genres`` and
        ``actors`` flattened to lower-case comma-separated strings,
        ``duration`` converted to an integer and ``imdbRating`` to a float.
    """
    # Characters left over after str() is applied to a list of names:
    # the list brackets and the quotes around each item.
    list_markup = str.maketrans("", "", "[]'\"")

    def flatten_list_text(text):
        # "['Crime', 'Drama']" --> "crime,drama"
        # Removing every quote also drops the apostrophe in names like
        # Genelia D'Souza, which was causing string handling issues.
        return text.lower().strip().translate(list_markup).replace(", ", ",")

    def minutes_text(text):
        # extract relevant duration PT89M --> 89
        return text.strip().replace("PT", "").replace("M", "")

    movies = load()

    # Why this is needed (raised in review): the genres list is turned into
    # a single comma-separated string for the movie_genre_relationship table.
    movies['genres'] = movies['genres'].astype('str').apply(flatten_list_text)

    movies['duration'] = (
        movies['duration'].astype('str').apply(minutes_text).astype(int)
    )

    # string to float conversion
    movies['imdbRating'] = movies['imdbRating'].astype('float')

    # Same flattening as for genres, applied to the actor names.
    movies['actors'] = movies['actors'].astype('str').apply(flatten_list_text)
    return movies
69 changes: 69 additions & 0 deletions code/load_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pyodbc
import teardown as td
import extract_transform as et
import config_handler as db_


# import pyodbc
# from configparser import ConfigParser


def main():
    """Rebuild the movie tables and load them from the JSON feed.

    Steps, in order:
      1. Drop the existing tables (``teardown.db_cleanup``).
      2. Read and transform the JSON (``extract_transform.json_to_df``).
      3. Create the three tables and commit.
      4. Insert one ``Top_rated_Movie`` row per movie, plus one relationship
         row per actor and per genre, then commit.

    The connection is always closed, even when a statement fails; closing
    without a commit discards the uncommitted inserts. Blank actor/genre
    entries (produced when a movie has an empty list) are skipped instead of
    being stored as empty relationship rows.
    """
    # For rerun purposes (in case of a new daily feed)
    td.db_cleanup()

    # Extract JSON and transform
    dataframe = et.json_to_df()

    # Queries
    create_top_rated_movie = """CREATE TABLE [Top_rated_Movie] (
        Id int NOT NULL CONSTRAINT [Id] PRIMARY KEY,
        title char (100) NULL ,
        [year] [date] NULL ,
        [genres] [nvarchar] (50) NULL ,
        [duration] [int] NULL ,
        [releaseDate] [nvarchar] (50) NULL ,
        [actors] [nvarchar] (500) NULL ,
        [imdbRating] [varchar] (50) NULL ,
        ) ON [PRIMARY] """
    create_movie_actor_relationship = """CREATE TABLE [Movie_Actor_Relationship] (
        [movieID] [int] NOT NULL ,
        [actor] [nvarchar] (500) NULL ,
        [imdbRating] [float] (50) NULL ,)
        """
    create_movie_genre_relationship = """
        CREATE TABLE [Movie_Genre_Relationship] (
        [movieID] [int] NOT NULL ,
        [genre] [nvarchar] (50) NULL ,
        [imdbRating] [float] (50) NULL ,
        )"""
    create_query = '{0};{1};{2}'.format(create_top_rated_movie,
                                        create_movie_actor_relationship,
                                        create_movie_genre_relationship)

    insert_top_rated_movie = """INSERT INTO dbo.Top_rated_Movie(Id,title,year,genres,actors,duration,
        releaseDate,imdbRating) values (?,?,?,?,?,?,?,?)"""
    insert_movie_actor_relationship = """INSERT INTO dbo.Movie_Actor_Relationship
        (movieID,actor,imdbRating) values (?,?,?) """
    insert_movie_genre_relationship = """INSERT INTO dbo.Movie_Genre_Relationship
        (movieID,genre,imdbRating) values (?,?,?)"""

    # write to MS SQL db
    conn = db_.ms_sql_connection()
    try:
        cursor = conn.cursor()

        # Create Tables
        cursor.execute(create_query)
        conn.commit()

        # Inserting data in SQL Table; the DataFrame index is the movie id.
        for index, row in dataframe.iterrows():
            # Key access instead of attribute access so a column name can
            # never be shadowed by a pandas Series attribute.
            rating = row['imdbRating']
            cursor.execute(insert_top_rated_movie,
                           index, row['title'], row['year'], row['genres'],
                           row['actors'], row['duration'],
                           row['releaseDate'], rating)
            for actor in row['actors'].split(","):
                if actor:  # "".split(",") yields [""]; skip blank names
                    cursor.execute(insert_movie_actor_relationship,
                                   index, actor, rating)
            for genre in row['genres'].split(","):
                if genre:
                    cursor.execute(insert_movie_genre_relationship,
                                   index, genre, rating)

        conn.commit()
        cursor.close()
    finally:
        # Release the connection on success and on failure alike.
        conn.close()
12 changes: 12 additions & 0 deletions code/teardown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import config_handler as db


def db_cleanup():
    """Drop the three movie tables if they exist, so a load can be rerun.

    The connection is closed even when the DROP statement fails.
    """
    drop_tables = "DROP TABLE IF EXISTS [dbo].[Top_rated_Movie];" \
                  + "DROP TABLE IF EXISTS [dbo].[Movie_Actor_Relationship];" \
                  + "DROP TABLE IF EXISTS [dbo].[Movie_Genre_Relationship]"
    conn = db.ms_sql_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(drop_tables)
        # Commit on the connection, matching how load_db commits its work.
        conn.commit()
        cursor.close()
    finally:
        conn.close()
8 changes: 8 additions & 0 deletions dev.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[db]
driver = {ODBC Driver 17 for SQL Server}
database = Movies_DB
trusted_connection = yes
server = (localdb)\MSSQLLocalDB

[json]
movie_list= ../json/top-rated-movies-02.json
Binary file added modeling_reporting/Insights.twbx
Binary file not shown.
Binary file added modeling_reporting/upwork.pptx
Binary file not shown.
Loading