diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8109293
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/img/
diff --git a/Insights.pptx b/Insights.pptx
new file mode 100644
index 0000000..429b9ca
Binary files /dev/null and b/Insights.pptx differ
diff --git a/code/__main__.py b/code/__main__.py
new file mode 100644
index 0000000..62d5243
--- /dev/null
+++ b/code/__main__.py
@@ -0,0 +1,7 @@
+import load_db as ldb
+import sys
+
+if __name__ == '__main__':
+    # execute only if run as the entry point into the program
+    ldb.main()
+    sys.exit(0)
diff --git a/code/config_handler.py b/code/config_handler.py
new file mode 100644
index 0000000..828e501
--- /dev/null
+++ b/code/config_handler.py
@@ -0,0 +1,83 @@
+from configparser import ConfigParser
+from pathlib import Path
+
+import pyodbc
+
+# Directory containing this module; dev.ini sits one level above it.
+_CODE_DIR = Path(__file__).resolve().parent
+_CONFIG_PATH = _CODE_DIR.parent / 'dev.ini'
+
+
+def load_config():
+    """Parse dev.ini and return the populated ConfigParser.
+
+    The file is located relative to this module instead of the current
+    working directory, so the result does not depend on where the
+    program is started from.
+
+    Raises:
+        FileNotFoundError: dev.ini could not be read.  ConfigParser.read()
+            silently skips unreadable files, which would otherwise only
+            surface later as a NoSectionError.
+    """
+    # Read configuration file
+    parser = ConfigParser()
+    if not parser.read(str(_CONFIG_PATH)):
+        raise FileNotFoundError(
+            'Configuration file not found: {}'.format(_CONFIG_PATH))
+    return parser
+
+
+def get_SqlConfig():
+    """Return (driver, database, trusted_connection, server) from [db]."""
+    parser = load_config()
+    # Read corresponding file parameters
+    driver = parser.get("db", "driver")
+    database = parser.get("db", "database")
+    trusted_connection = parser.get("db", "trusted_connection")
+    server = parser.get("db", "server")
+    return driver, database, trusted_connection, server  # return required parameters
+
+
+def ms_sql_connection():
+    """Open and return a pyodbc connection built from the [db] settings.
+
+    The caller owns the connection and is responsible for closing it.
+    """
+    driver, database, trusted_connection, server = get_SqlConfig()
+    conn_info = ';'.join([
+        'DRIVER=' + driver,
+        'TrustServerCertificate=No',
+        'DATABASE=' + database,
+        'SERVER=' + server,
+        # Previously read from dev.ini but never passed to the driver.
+        'Trusted_Connection=' + trusted_connection,
+    ])
+    return pyodbc.connect(conn_info)
+
+
+# load json path from configuration file
+def json_path():
+    """Return the movie-list JSON path from the [json] section.
+
+    A relative path is resolved against the directory of this module,
+    which gives the same file as starting the loader from that directory.
+    """
+    configured = Path(load_config().get("json", "movie_list"))
+    if not configured.is_absolute():
+        configured = _CODE_DIR / configured
+    return str(configured)
+
+
+# ---- snippet originally used to create the .ini file (reference only) ----
+
+# config = ConfigParser()
+# config['db'] = {
+#     "driver": "{ODBC Driver 17 for SQL Server}",
+#     "database": "Movies_DB",
+#     "trusted_Connection": "yes",
+#     "server": r"(localdb)\MSSQLLocalDB"
+#
+# }
+# with open('../dev.ini', 'w') as f:
+#     config.write(f)
diff --git a/code/extract_transform.py b/code/extract_transform.py
new file mode 100644
index 0000000..5380a0a
--- /dev/null
+++ b/code/extract_transform.py
@@ -0,0 +1,74 @@
+import json
+import re
+
+import pandas as pd
+import config_handler as ch
+
+# ISO 8601 time-only duration with optional hour and minute parts,
+# e.g. "PT89M" or "PT1H29M".  A bare number ("89") is read as minutes.
+_DURATION_RE = re.compile(r'^(?:PT)?(?:(\d+)H)?(?:(\d+)M?)?$')
+
+
+def load():
+    """Read the configured movie JSON file into a DataFrame."""
+    # JSON text is UTF-8 by specification; the platform default encoding
+    # (e.g. cp1252 on Windows) would mis-decode non-ASCII titles and names.
+    with open(ch.json_path(), encoding='utf-8') as f:
+        data = json.load(f)
+    return pd.DataFrame(data)
+
+
+def _list_to_csv(value):
+    """Return *value* as one lower-case, comma-separated string.
+
+    List elements are joined directly, so an apostrophe inside a name
+    (Genelia D'Souza) is kept instead of being stripped together with the
+    quote characters of the list's string representation.
+    """
+    if isinstance(value, (list, tuple)):
+        items = (str(item).strip().lower() for item in value)
+        return ','.join(item for item in items if item)
+    # Not a list: apply the original text clean-up unchanged.
+    text = str(value).lower().strip()
+    for char in ('[', ']', "'", '"'):
+        text = text.replace(char, '')
+    return text.replace(', ', ',')
+
+
+def _duration_minutes(value):
+    """Convert a duration such as "PT89M" or "PT1H29M" to whole minutes.
+
+    Raises:
+        ValueError: *value* is empty or not in a supported format.
+    """
+    match = _DURATION_RE.match(str(value).strip())
+    if match is None or not any(match.groups()):
+        raise ValueError('Unsupported duration: {!r}'.format(value))
+    hours, minutes = (int(part or 0) for part in match.groups())
+    return hours * 60 + minutes
+
+
+# JSON to DataFrame
+def json_to_df():
+    """Load the movie feed and normalise the columns used by the DB load.
+
+    Returns:
+        DataFrame whose ``genres`` and ``actors`` columns are lower-case
+        comma-separated strings, ``duration`` is minutes as int and
+        ``imdbRating`` is float.
+    """
+    dataframe = load()
+
+    # list to string for the movie_genre_relationship table
+    dataframe['genres'] = dataframe['genres'].apply(_list_to_csv)
+
+    # extract relevant duration PT89M --> 89
+    dataframe['duration'] = dataframe['duration'].apply(
+        _duration_minutes).astype(int)
+
+    # string to float conversion
+    dataframe['imdbRating'] = dataframe['imdbRating'].astype('float')
+
+    # list to string for the movie_actor_relationship table
+    dataframe['actors'] = dataframe['actors'].apply(_list_to_csv)
+    return dataframe
diff --git a/code/load_db.py b/code/load_db.py
new file mode 100644
index 0000000..1ba4d15
--- /dev/null
+++ b/code/load_db.py
@@ -0,0 +1,86 @@
+import pyodbc
+import teardown as td
+import extract_transform as et
+import config_handler as db_
+
+
+# import pyodbc
+# from configparser import ConfigParser
+
+# Queries
+_CREATE_TOP_RATED_MOVIE = """CREATE TABLE [Top_rated_Movie] (
+    Id int NOT NULL CONSTRAINT [Id] PRIMARY KEY,
+    title char (100) NULL ,
+    [year] [date] NULL ,
+    [genres] [nvarchar] (50) NULL ,
+    [duration] [int] NULL ,
+    [releaseDate] [nvarchar] (50) NULL ,
+    [actors] [nvarchar] (500) NULL ,
+    [imdbRating] [varchar] (50) NULL ,
+    ) ON [PRIMARY] """
+_CREATE_MOVIE_ACTOR_RELATIONSHIP = """CREATE TABLE [Movie_Actor_Relationship] (
+    [movieID] [int] NOT NULL ,
+    [actor] [nvarchar] (500) NULL ,
+    [imdbRating] [float] (50) NULL ,)
+    """
+_CREATE_MOVIE_GENRE_RELATIONSHIP = """
+    CREATE TABLE [Movie_Genre_Relationship] (
+    [movieID] [int] NOT NULL ,
+    [genre] [nvarchar] (50) NULL ,
+    [imdbRating] [float] (50) NULL ,
+    )"""
+_INSERT_TOP_RATED_MOVIE = """INSERT INTO dbo.Top_rated_Movie(Id,title,year,genres,actors,duration,
+    releaseDate,imdbRating) values (?,?,?,?,?,?,?,?)"""
+_INSERT_MOVIE_ACTOR_RELATIONSHIP = """INSERT INTO dbo.Movie_Actor_Relationship
+    (movieID,actor,imdbRating) values (?,?,?) """
+_INSERT_MOVIE_GENRE_RELATIONSHIP = """INSERT INTO dbo.Movie_Genre_Relationship
+    (movieID,genre,imdbRating) values (?,?,?)"""
+
+
+def _split_csv(value):
+    """Split a comma-separated string, skipping empty entries."""
+    return [item for item in value.split(",") if item]
+
+
+def _insert_movies(cursor, dataframe):
+    """Insert each movie plus one row per actor and one row per genre."""
+    for index, row in dataframe.iterrows():
+        rating = row['imdbRating']
+        cursor.execute(_INSERT_TOP_RATED_MOVIE,
+                       index, row['title'], row['year'], row['genres'],
+                       row['actors'], row['duration'], row['releaseDate'],
+                       rating)
+        for actor in _split_csv(row['actors']):
+            cursor.execute(_INSERT_MOVIE_ACTOR_RELATIONSHIP,
+                           index, actor, rating)
+        for genre in _split_csv(row['genres']):
+            cursor.execute(_INSERT_MOVIE_GENRE_RELATIONSHIP,
+                           index, genre, rating)
+
+
+def main():
+    """Drop, recreate and repopulate the movie tables from the JSON feed."""
+    # For rerun purposes (in case of a new daily feed)
+    td.db_cleanup()
+
+    # Extract JSON and transform
+    dataframe = et.json_to_df()
+
+    # write to MS SQL db
+    create_query = '{0};{1};{2}'.format(_CREATE_TOP_RATED_MOVIE,
+                                        _CREATE_MOVIE_ACTOR_RELATIONSHIP,
+                                        _CREATE_MOVIE_GENRE_RELATIONSHIP)
+    conn = db_.ms_sql_connection()
+    try:
+        cursor = conn.cursor()
+        # Create Tables
+        cursor.execute(create_query)
+        conn.commit()
+
+        # Inserting data in SQL Table:-
+        _insert_movies(cursor, dataframe)
+        conn.commit()
+        cursor.close()
+    finally:
+        # Always release the connection, including when a statement fails.
+        conn.close()
diff --git a/code/teardown.py b/code/teardown.py
new file mode 100644
index 0000000..eae300c
--- /dev/null
+++ b/code/teardown.py
@@ -0,0 +1,17 @@
+import config_handler as db
+
+
+def db_cleanup():
+    """Drop the three loader tables (if present) so a load can be re-run."""
+    drop_tables = "DROP TABLE IF EXISTS [dbo].[Top_rated_Movie];" \
+                  + "DROP TABLE IF EXISTS [dbo].[Movie_Actor_Relationship];" \
+                  + "DROP TABLE IF EXISTS [dbo].[Movie_Genre_Relationship]"
+    conn = db.ms_sql_connection()
+    try:
+        cursor = conn.cursor()
+        cursor.execute(drop_tables)
+        conn.commit()
+        cursor.close()
+    finally:
+        # Release the connection even when the DROP fails.
+        conn.close()
diff --git a/dev.ini b/dev.ini
new file mode 100644
index 0000000..dc1ca03
--- /dev/null
+++ b/dev.ini
@@ -0,0 +1,8 @@
+[db]
+driver = {ODBC Driver 17 for SQL Server}
+database = Movies_DB
+trusted_connection = yes
+server = (localdb)\MSSQLLocalDB
+
+[json]
+movie_list= ../json/top-rated-movies-02.json
\ No newline at end of file
diff --git a/modeling_reporting/Insights.twbx b/modeling_reporting/Insights.twbx
new file mode 100644
index 0000000..e504553
Binary files /dev/null and b/modeling_reporting/Insights.twbx differ
diff --git a/modeling_reporting/upwork.pptx b/modeling_reporting/upwork.pptx
new file mode 100644
index 0000000..88586b9
Binary files /dev/null and b/modeling_reporting/upwork.pptx differ
diff --git a/schema/Publish.sql b/schema/Publish.sql
new file mode 100644
index 0000000..2900a41
--- /dev/null
+++ b/schema/Publish.sql
@@ -0,0 +1,310 @@
+/*
+Deployment script for NEW
+
+This code was generated by a tool.
+Changes to this file may cause incorrect behavior and will be lost if
+the code is regenerated.
+*/
+-- NOTE(review): uses :setvar / :on error directives, so it must be run in SQLCMD mode (a NOEXEC guard follows).
+GO
+SET ANSI_NULLS, ANSI_PADDING, ANSI_WARNINGS, ARITHABORT, CONCAT_NULL_YIELDS_NULL, QUOTED_IDENTIFIER ON;
+
+SET NUMERIC_ROUNDABORT OFF;
+
+
+GO
+:setvar DatabaseName "NEW"
+:setvar DefaultFilePrefix "NEW"
+:setvar DefaultDataPath "C:\Users\b_kir\AppData\Local\Microsoft\Microsoft SQL Server Local DB\Instances\MSSQLLocalDB\"
+:setvar DefaultLogPath "C:\Users\b_kir\AppData\Local\Microsoft\Microsoft SQL Server Local DB\Instances\MSSQLLocalDB\"
+-- NOTE(review): both paths point into one user profile (C:\Users\b_kir); presumably they must be adjusted on other machines.
+GO
+:on error exit
+GO
+/*
+Detect SQLCMD mode and disable script execution if SQLCMD mode is not supported.
+To re-enable the script after enabling SQLCMD mode, execute the following:
+SET NOEXEC OFF; 
+*/
+:setvar __IsSqlCmdEnabled "True"
+GO
+IF N'$(__IsSqlCmdEnabled)' NOT LIKE N'True'
+    BEGIN
+        PRINT N'SQLCMD mode must be enabled to successfully execute this script.';
+        SET NOEXEC ON;
+    END
+
+
+GO
+USE [master];
+
+
+GO
+-- Destructive: a database that already exists under the configured name is dropped before being recreated.
+IF (DB_ID(N'$(DatabaseName)') IS NOT NULL) 
+BEGIN
+    ALTER DATABASE [$(DatabaseName)]
+    SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
+    DROP DATABASE [$(DatabaseName)];
+END
+
+GO
+PRINT N'Creating $(DatabaseName)...'
+GO
+CREATE DATABASE [$(DatabaseName)]
+    ON 
+    PRIMARY(NAME = [$(DatabaseName)], FILENAME = N'$(DefaultDataPath)$(DefaultFilePrefix)_Primary.mdf')
+    LOG ON (NAME = [$(DatabaseName)_log], FILENAME = N'$(DefaultLogPath)$(DefaultFilePrefix)_Primary.ldf') COLLATE SQL_Latin1_General_CP1_CI_AS
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET AUTO_CLOSE OFF 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+USE [$(DatabaseName)];
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET ANSI_NULLS ON,
+                ANSI_PADDING ON,
+                ANSI_WARNINGS ON,
+                ARITHABORT ON,
+                CONCAT_NULL_YIELDS_NULL ON,
+                NUMERIC_ROUNDABORT OFF,
+                QUOTED_IDENTIFIER ON,
+                ANSI_NULL_DEFAULT ON,
+                CURSOR_DEFAULT LOCAL,
+                RECOVERY FULL,
+                CURSOR_CLOSE_ON_COMMIT OFF,
+                AUTO_CREATE_STATISTICS ON,
+                AUTO_SHRINK OFF,
+                AUTO_UPDATE_STATISTICS ON,
+                RECURSIVE_TRIGGERS OFF 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET ALLOW_SNAPSHOT_ISOLATION OFF;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET READ_COMMITTED_SNAPSHOT OFF 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET AUTO_UPDATE_STATISTICS_ASYNC OFF,
+                PAGE_VERIFY CHECKSUM,
+                DATE_CORRELATION_OPTIMIZATION OFF,
+                DISABLE_BROKER,
+                PARAMETERIZATION SIMPLE,
+                SUPPLEMENTAL_LOGGING OFF 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF IS_SRVROLEMEMBER(N'sysadmin') = 1
+    BEGIN
+        IF EXISTS (SELECT 1
+                   FROM   [master].[dbo].[sysdatabases]
+                   WHERE  [name] = N'$(DatabaseName)')
+            BEGIN
+                EXECUTE sp_executesql N'ALTER DATABASE [$(DatabaseName)]
+    SET TRUSTWORTHY OFF,
+        DB_CHAINING OFF 
+    WITH ROLLBACK IMMEDIATE';
+            END
+    END
+ELSE
+    BEGIN
+        PRINT N'The database settings cannot be modified. You must be a SysAdmin to apply these settings.';
+    END
+
+
+GO
+IF IS_SRVROLEMEMBER(N'sysadmin') = 1
+    BEGIN
+        IF EXISTS (SELECT 1
+                   FROM   [master].[dbo].[sysdatabases]
+                   WHERE  [name] = N'$(DatabaseName)')
+            BEGIN
+                EXECUTE sp_executesql N'ALTER DATABASE [$(DatabaseName)]
+    SET HONOR_BROKER_PRIORITY OFF 
+    WITH ROLLBACK IMMEDIATE';
+            END
+    END
+ELSE
+    BEGIN
+        PRINT N'The database settings cannot be modified. You must be a SysAdmin to apply these settings.';
+    END
+
+
+GO
+ALTER DATABASE [$(DatabaseName)]
+    SET TARGET_RECOVERY_TIME = 60 SECONDS 
+    WITH ROLLBACK IMMEDIATE;
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET FILESTREAM(NON_TRANSACTED_ACCESS = OFF),
+                CONTAINMENT = NONE 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET AUTO_CREATE_STATISTICS ON(INCREMENTAL = OFF),
+                MEMORY_OPTIMIZED_ELEVATE_TO_SNAPSHOT = OFF,
+                DELAYED_DURABILITY = DISABLED 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET QUERY_STORE (QUERY_CAPTURE_MODE = ALL, DATA_FLUSH_INTERVAL_SECONDS = 900, INTERVAL_LENGTH_MINUTES = 60, MAX_PLANS_PER_QUERY = 200, CLEANUP_POLICY = (STALE_QUERY_THRESHOLD_DAYS = 30), MAX_STORAGE_SIZE_MB = 100) 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE [$(DatabaseName)]
+            SET QUERY_STORE = OFF 
+            WITH ROLLBACK IMMEDIATE;
+    END
+
+
+GO
+IF EXISTS (SELECT 1
+           FROM   [master].[dbo].[sysdatabases]
+           WHERE  [name] = N'$(DatabaseName)')
+    BEGIN
+        ALTER DATABASE SCOPED CONFIGURATION SET MAXDOP = 0;
+        ALTER DATABASE SCOPED CONFIGURATION FOR SECONDARY SET MAXDOP = PRIMARY;
+        ALTER DATABASE SCOPED CONFIGURATION SET LEGACY_CARDINALITY_ESTIMATION = OFF;
+        ALTER DATABASE SCOPED CONFIGURATION FOR SECONDARY SET LEGACY_CARDINALITY_ESTIMATION = PRIMARY;
+        ALTER DATABASE SCOPED CONFIGURATION SET PARAMETER_SNIFFING = ON;
+        ALTER DATABASE SCOPED CONFIGURATION FOR SECONDARY SET PARAMETER_SNIFFING = PRIMARY;
+        ALTER DATABASE SCOPED CONFIGURATION SET QUERY_OPTIMIZER_HOTFIXES = OFF;
+        ALTER DATABASE SCOPED CONFIGURATION FOR SECONDARY SET QUERY_OPTIMIZER_HOTFIXES = PRIMARY;
+    END
+
+
+GO
+IF fulltextserviceproperty(N'IsFulltextInstalled') = 1
+    EXECUTE sp_fulltext_database 'enable';
+
+
+GO
+PRINT N'Creating [dbo].[Movie_Actor_Relationship]...';
+
+
+GO
+CREATE TABLE [dbo].[Movie_Actor_Relationship] (
+    [movieID]    INT            NOT NULL,
+    [actor]      NVARCHAR (500) NULL,
+    [imdbRating] FLOAT (53)     NULL
+);
+
+
+GO
+PRINT N'Creating [dbo].[Movie_Genre_Relationship]...';
+
+
+GO
+CREATE TABLE [dbo].[Movie_Genre_Relationship] (
+    [movieID]    INT           NOT NULL,
+    [genre]      NVARCHAR (50) NULL,
+    [imdbRating] FLOAT (53)    NULL
+);
+
+
+GO
+PRINT N'Creating [dbo].[Top_rated_Movie]...';
+
+-- NOTE(review): imdbRating is VARCHAR(50) in this table but FLOAT(53) in the two relationship tables; confirm this is intended.
+GO
+CREATE TABLE [dbo].[Top_rated_Movie] (
+    [Id]          INT            NOT NULL,
+    [title]       CHAR (100)     NULL,
+    [year]        DATE           NULL,
+    [genres]      NVARCHAR (50)  NULL,
+    [duration]    INT            NULL,
+    [releaseDate] NVARCHAR (50)  NULL,
+    [actors]      NVARCHAR (500) NULL,
+    [imdbRating]  VARCHAR (50)   NULL,
+    CONSTRAINT [Id] PRIMARY KEY CLUSTERED ([Id] ASC)
+);
+
+
+GO
+DECLARE @VarDecimalSupported AS BIT;
+-- The flag is set only when EngineEdition is 3 and @@microsoftversion / 2^24 is 9 (low word >= 3024) or 10 (low word >= 1600).
+SELECT @VarDecimalSupported = 0;
+
+IF ((ServerProperty(N'EngineEdition') = 3)
+    AND (((@@microsoftversion / power(2, 24) = 9)
+          AND (@@microsoftversion & 0xffff >= 3024))
+         OR ((@@microsoftversion / power(2, 24) = 10)
+             AND (@@microsoftversion & 0xffff >= 1600))))
+    SELECT @VarDecimalSupported = 1;
+
+IF (@VarDecimalSupported > 0)
+    BEGIN
+        EXECUTE sp_db_vardecimal_storage_format N'$(DatabaseName)', 'ON';
+    END
+
+
+GO
+PRINT N'Update complete.';
+
+
+GO
diff --git a/schema/SQL_Schema.dacpac b/schema/SQL_Schema.dacpac
new file mode 100644
index 0000000..c13085c
Binary files /dev/null and b/schema/SQL_Schema.dacpac differ