-
Notifications
You must be signed in to change notification settings - Fork 123
WIP: movie_json_data analysis #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 20 commits
abc5228
733d731
f28b89e
6611c9c
60c8749
c362e57
8f24778
fd4f923
984e281
6081887
b6f000d
5aa1779
bdbb2b5
c111967
828673a
c372cb0
46e5e1a
cfcaf23
4e1bec6
145e778
3624a7a
d9affe4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| /img/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| import load_db as ldb | ||
| import sys | ||
|
|
||
if __name__ == '__main__':
    # Script entry point: run the database load, then report success to the
    # shell. Nothing happens when this module is merely imported.
    ldb.main()
    raise SystemExit(0)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
| from configparser import ConfigParser | ||
|
|
||
| import pyodbc | ||
|
|
||
|
|
||
def load_config(path='../dev.ini'):
    """Read the INI configuration file and return the populated parser.

    Args:
        path: Location of the INI file. Defaults to ``'../dev.ini'``, which
            is resolved against the current working directory.

    Returns:
        A ``ConfigParser`` loaded from ``path``.

    Raises:
        FileNotFoundError: If ``path`` could not be read.
    """
    parser = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed. Fail here with a clear message rather
    # than letting callers hit a NoSectionError on an empty parser later.
    if not parser.read(path):
        raise FileNotFoundError(
            "Configuration file not found or unreadable: {}".format(path))
    return parser
|
|
||
|
|
||
def get_SQLCONFIG():
    """Return the SQL Server settings stored in the ``[db]`` config section.

    Returns:
        A 4-tuple ``(driver, database, trusted_connection, server)`` of the
        raw string values from the configuration file.
    """
    config = load_config()
    # The order below is the order callers index into; keep it stable.
    wanted = ("driver", "database", "trusted_connection", "server")
    return tuple(config.get("db", option) for option in wanted)
|
|
||
|
|
||
def ms_sql_connection():
    """Open a pyodbc connection using the ``[db]`` configuration settings.

    Returns:
        The connection object returned by ``pyodbc.connect``. The caller is
        responsible for closing it.
    """
    driver, database, trusted_connection, server = get_SQLCONFIG()
    conn_info = ';'.join([
        'DRIVER=' + driver,
        'TrustServerCertificate=No',
        'DATABASE=' + database,
        'SERVER=' + server,
        # The configured trusted_connection value used to be read and then
        # dropped; pass it on so the requested authentication mode is used.
        'Trusted_Connection=' + trusted_connection,
    ])
    return pyodbc.connect(conn_info)
|
|
||
|
|
||
# Look up the location of the movie JSON file in the configuration file
def json_path():
    """Return the ``movie_list`` value from the ``[json]`` config section."""
    return load_config().get("json", "movie_list")
|
|
||
| ############create .ini file | ||
|
|
||
| # config = ConfigParser() | ||
| # config['db'] = { | ||
| # "driver": "{ODBC Driver 17 for SQL Server}", | ||
| # "database": "Movies_DB", | ||
| # "trusted_Connection": "yes", | ||
| # "server": "(localdb)\MSSQLLocalDB" | ||
| # | ||
| # } | ||
| # with open('../dev.ini', 'w') as f: | ||
| # config.write(f) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| import json | ||
| import pandas as pd | ||
| import config_handler as ch | ||
|
|
||
|
|
||
def load():
    """Load the configured movie JSON file into a pandas DataFrame.

    The file location comes from ``ch.json_path()``.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON document.
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale, which mis-decodes non-ASCII text
    # (e.g. accented names) on systems whose default is not UTF-8.
    with open(ch.json_path(), encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)
|
|
||
|
|
||
def _flatten_list_column(value):
    """Render a list-valued cell as a lower-case, comma-separated string."""
    text = str(value).lower().strip()
    # str() of a list looks like "['a', 'b']" (or uses double quotes when an
    # item contains an apostrophe, e.g. "Genelia D'Souza"). Strip brackets
    # and both quote styles so only the bare items remain.
    # NOTE: this also removes apostrophes that are part of a name.
    for char in ("[", "]", "'", '"'):
        text = text.replace(char, "")
    return text.replace(", ", ",")


def _duration_to_minutes(value):
    """Convert an ISO-8601 style duration such as ``PT89M`` to whole minutes.

    Values with an hours part (``PT1H30M``, ``PT2H``) are supported as well.
    Anything else that is not a plain number of minutes raises ``ValueError``.
    """
    body = str(value).strip().replace("PT", "")
    hours, has_hours, rest = body.partition("H")
    if has_hours:
        minutes = rest.replace("M", "")
        return int(hours) * 60 + (int(minutes) if minutes else 0)
    return int(body.replace("M", ""))


# JSON to DataFrame
def json_to_df():
    """Load the movie JSON and normalise the columns needed for the DB load.

    Returns:
        The DataFrame from ``load()`` with these columns converted:
        ``genres`` and ``actors`` become comma-separated lower-case strings,
        ``duration`` becomes an integer number of minutes, and
        ``imdbRating`` becomes a float.
    """
    dataframe = load()

    # Lists are flattened to "a,b,c" strings so they can be stored in one
    # column and later split again for the movie/genre relationship table.
    dataframe['genres'] = dataframe['genres'].astype('str').apply(
        _flatten_list_column)

    # Extract the numeric duration, e.g. PT89M --> 89
    dataframe['duration'] = dataframe['duration'].astype('str').apply(
        _duration_to_minutes).astype(int)

    # String to float conversion
    dataframe['imdbRating'] = dataframe['imdbRating'].astype('float')

    # Same flattening as genres, used for the movie/actor relationship table
    dataframe['actors'] = dataframe['actors'].astype('str').apply(
        _flatten_list_column)
    return dataframe
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| import pyodbc | ||
| import teardown as td | ||
| import extract_transform as et | ||
| import config_handler as db_ | ||
|
|
||
|
|
||
| # import pyodbc | ||
| # from configparser import ConfigParser | ||
|
|
||
|
|
||
def main():
    """Rebuild the movie tables in SQL Server from the movie JSON file.

    Steps: drop any existing tables via ``td.db_cleanup()``, load and
    transform the JSON via ``et.json_to_df()``, create the three tables, then
    insert one row per movie plus one row per (movie, actor) and per
    (movie, genre) pair. The DataFrame index is used as the movie id.
    """
    # Start from a clean slate so the load can be rerun (e.g. for a new feed)
    td.db_cleanup()

    # Extract the JSON and transform it
    dataframe = et.json_to_df()

    # Write to the MS SQL database
    conn = db_.ms_sql_connection()
    # Ensure the connection is released even if a statement fails part-way.
    try:
        cursor = conn.cursor()

        # Create tables
        cursor.execute("""CREATE TABLE [Top_rated_Movie] (
            Id int NOT NULL CONSTRAINT [Id] PRIMARY KEY,
            title char (100) NULL ,
            [year] [date] NULL ,
            [genres] [nvarchar] (50) NULL ,
            [duration] [int] NULL ,
            [releaseDate] [nvarchar] (50) NULL ,
            [actors] [nvarchar] (500) NULL ,
            [imdbRating] [varchar] (50) NULL ,
            ) ON [PRIMARY] """)

        cursor.execute("""CREATE TABLE [Movie_Actor_Relationship] (
            [movieID] [int] NOT NULL ,
            [actor] [nvarchar] (500) NULL ,
            [imdbRating] [float] (50) NULL ,)
            """)

        cursor.execute("""
            CREATE TABLE [Movie_Genre_Relationship] (
            [movieID] [int] NOT NULL ,
            [genre] [nvarchar] (50) NULL ,
            [imdbRating] [float] (50) NULL ,
            )""")
        conn.commit()

        # Insert the data
        for index, row in dataframe.iterrows():
            cursor.execute(
                """INSERT INTO dbo.Top_rated_Movie(Id,title,year,genres,actors,duration,
                releaseDate,imdbRating) values (?,?,?,?,?,?,?,?)""",
                index, row.title, row.year, row['genres'], row['actors'],
                row.duration, row.releaseDate, row.imdbRating)
            # actors/genres are comma-separated strings; fan them out into
            # one relationship row per item.
            for actor in row['actors'].split(","):
                cursor.execute(
                    """INSERT INTO dbo.Movie_Actor_Relationship(movieID,actor,imdbRating) values (?,?,?)""",
                    index, actor, row.imdbRating)
            for genre in row['genres'].split(","):
                cursor.execute(
                    """INSERT INTO dbo.Movie_Genre_Relationship(movieID,genre,imdbRating) values (?,?,?)""",
                    index, genre, row.imdbRating)

        conn.commit()
        cursor.close()
    finally:
        conn.close()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| import config_handler as db | ||
|
|
||
|
|
||
def db_cleanup():
    """Drop the three movie tables if they exist, so a load can be rerun."""
    drop_statements = (
        "DROP TABLE IF EXISTS [dbo].[Top_rated_Movie]",
        "DROP TABLE IF EXISTS [dbo].[Movie_Actor_Relationship]",
        "DROP TABLE IF EXISTS [dbo].[Movie_Genre_Relationship]",
    )
    conn = db.ms_sql_connection()
    # Close the connection even when one of the DROP statements fails.
    try:
        cursor = conn.cursor()
        for statement in drop_statements:
            cursor.execute(statement)
        cursor.commit()
    finally:
        conn.close()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| [db] | ||
| driver = {ODBC Driver 17 for SQL Server} | ||
| database = Movies_DB | ||
| trusted_connection = yes | ||
| server = (localdb)\MSSQLLocalDB | ||
|
|
||
| [json] | ||
| movie_list= ../json/top-rated-movies-02.json |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you prepend local variables with underscore?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated