
Solution Christian #2

Open · wants to merge 26 commits into base: main

Commits (26)
ad3493d
feat(refactor): Creating venv
christianint Jul 26, 2022
862754a
feat(refactor): Preparing pipfile
christianint Aug 1, 2022
c3b5198
feat(refactor): Preparing lint
christianint Aug 1, 2022
b867dcc
feat(refactor): Starting preprocess
christianint Aug 1, 2022
bf35edb
feat(refactor): Include .env
christianint Aug 1, 2022
69ed3cd
feat(refactor): Include preprocess bathroom and extract columns of in…
christianint Aug 1, 2022
0e0cc24
feat(refactor): Include preprocess nan values
christianint Aug 1, 2022
2f2e937
ref(refactor): return dataframe in some functions
christianint Aug 1, 2022
bfc5e42
feat(preprocess): Preprocess the category column
christianint Aug 1, 2022
1c21d44
feat(preprocess): Finish preprocess
christianint Aug 2, 2022
978ba11
feat(train): Creating files
christianint Aug 2, 2022
1cb793e
ref(preprocess): Deleting unused file
christianint Aug 2, 2022
17b9ae8
feat(Config): Create more config parameters
christianint Aug 3, 2022
31bfe4a
feat(preprocess): Add more preprocess from second notebook
christianint Aug 3, 2022
12761c5
fix(preprocess): Solving config problem and dealing with nan values
christianint Aug 3, 2022
f680e3f
feat(Train): Implemented train process
christianint Aug 3, 2022
533702e
Merge pull request #1 from christianint/feature/challenge-1
christianint Aug 3, 2022
67193ab
feat(api): Create simple api
christianint Aug 3, 2022
49b4d22
feat(train): Training with numpy arrays
christianint Aug 4, 2022
b2ea984
feat(api): Simple implementation for the API
christianint Aug 4, 2022
02f4f5f
fix(api): Remove logging not used
christianint Aug 4, 2022
c052d74
Merge pull request #2 from christianint/feature/challenge-2
christianint Aug 4, 2022
524c7a5
feat(dockerize): Creating dockerfile for create docker image
christianint Aug 4, 2022
22b902d
Merge pull request #3 from christianint/feature/challenge-3
christianint Aug 4, 2022
182cbba
doc(solution): Included comments in main Readme.md
christianint Aug 4, 2022
d752b2a
Merge pull request #4 from christianint/doc/solution-comments
christianint Aug 4, 2022
1 change: 1 addition & 0 deletions .env
@@ -0,0 +1 @@
PYTHONPATH=lab
17 changes: 16 additions & 1 deletion .gitignore
@@ -1,3 +1,18 @@
Pipfile

.vscode/
.ipynb_checkpoints
__pycache__
.idea/
*.bundle
*.csv
*.joblib
*.kvmodel
*.npy
*.pt
*.png
*.tgz
.mypy_cache
.ropeproject
.coverage
*.log
17 changes: 17 additions & 0 deletions Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.9

ENV PYTHONPATH=lab
@mastertilla (Contributor), Aug 10, 2022:

You can add `ENV PYTHONUNBUFFERED=1` to force stdout to be flushed to the terminal (recommended when running Python in Docker).
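Applying that suggestion, the environment block at the top of the Dockerfile could look like this (a sketch; only the `PYTHONUNBUFFERED` line is new, the rest is taken from the file as-is):

```dockerfile
FROM python:3.9

ENV PYTHONPATH=lab
# Flush stdout/stderr straight to the terminal instead of buffering (useful for `docker logs`)
ENV PYTHONUNBUFFERED=1
```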


WORKDIR /code

COPY ./Pipfile /code/Pipfile
COPY ./Pipfile.lock /code/Pipfile.lock

RUN python -m pip install --upgrade pip
RUN pip install pipenv
RUN pipenv install --clear --system

COPY ./lab/api /code/lab/api
COPY ./models /code/models

CMD ["./lab/api/launch.sh"]
21 changes: 21 additions & 0 deletions Pipfile
@@ -0,0 +1,21 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pandas = "==1.4.3"
matplotlib = "==3.5.2"
numpy = "==1.23.1"
scikit-learn = "==1.1.1"
fastapi = "==0.79.0"
uvicorn = "==0.18.2"

[dev-packages]
ipykernel = "==6.15.1"
seaborn = "==0.11.2"
pytest = "==7.1.2"
pylint = "==2.14.5"

[requires]
python_version = "3.9"
1,311 changes: 1,311 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions README.md
@@ -48,3 +48,24 @@ The key is to ensure the API is easy to use and easy to test. Feel free to archi
## Challenge 3 - Dockerize your solution

Nowadays, we can't think of ML solutions in production without thinking about Docker and its benefits in terms of standardisation, scalability and performance. The objective here is to dockerize your API and ensure it is easy to deploy and run in production.

## Solution

In this section I've included comments, reasoning and decisions taken about the ml-challenge.

### Challenge-1
- I think it is convenient to drop the 'price' column once we extract the category from it (to discuss with DS)
- I've created several config classes. These classes encapsulate config information and, depending on where the solution is deployed, should be refactored to read the config from different sources (environment variables, config files, etc.)
- Amenities has been broken out into several other columns, but they are not used in the inference process. I've tried including these columns, but the improvement in the model is minimal. Should they be included? (to discuss with DS)
- Could we categorize property_type too? (to discuss with DS)
- I've included the metrics in the final pickle with the model, as a dictionary. With this information it could be interesting to use a framework such as MLflow

### Challenge-2
- The API is as simple as possible and, due to time constraints, I haven't included enough testing on this part. That has to be the next step

### Challenge-3
- I've only included the Dockerfile and the script to launch the API, not the code for running a container from this image. The instructions are as follows:
```
docker build -t ml-challenge .
docker run --name ml-challenge-container -p 80:80 ml-challenge
```
Empty file added lab/api/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions lab/api/app.py
@@ -0,0 +1,38 @@
"""
This file contains the definition of the API, as simple as possible, to run inference with the model
"""

from fastapi import FastAPI

from api.inference_engine.inference_engine import InferenceEngine
from api.models.inference_models import InferenceRequest, InferenceResponse
from api.settings import AppSettings

settings = AppSettings()
inference_engine = InferenceEngine(settings=settings)
app = FastAPI()


Contributor comment:

An extra point would have been to add some security, such as a simple API token.
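As a sketch of what the reviewer suggests, the core of a simple token check can be built on `hmac.compare_digest`; the `verify_token` helper is hypothetical, not part of this PR:

```python
import hmac


def verify_token(received: str, expected: str) -> bool:
    """Constant-time string comparison, to avoid leaking the token via timing."""
    return hmac.compare_digest(received, expected)
```

In FastAPI this would typically live in a dependency that reads a header (e.g. `X-API-Token`) and compares it against a value held in `AppSettings`, returning a 401 on mismatch.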

@app.get("/")
async def root():
"""
Root endpoint

Returns:
dict: default message for api
"""
return {"message": "Welcome to " + settings.app_name}


@app.get("/inference/")
async def inference(request: InferenceRequest) -> InferenceResponse:
"""
This endpoint receives an inference request and returns its result

Args:
request (InferenceRequest): Inference received

Returns:
InferenceResponse: Inference response created
"""
return InferenceResponse(id=request.id, price_category=inference_engine.inference(request))
67 changes: 67 additions & 0 deletions lab/api/inference_engine/inference_engine.py
@@ -0,0 +1,67 @@
"""
This file contains all functions related with the inference process
"""
import pickle

import numpy as np
from fastapi import HTTPException

from api.models.inference_models import InferenceRequest
from api.settings import AppSettings


class InferenceEngine:
"""
This class encapsulates the inference logic
"""

def __init__(self, settings: AppSettings) -> None:

# Loading model
with open(settings.model_path, "rb") as f:
pickle_info = pickle.load(f)

self.__clf = pickle_info[0]
self.__settings = settings

def inference(self, request: InferenceRequest) -> dict:
"""
Runs inference on a request and maps the prediction to its price category

Args:
request (InferenceRequest): request inference

Returns:
dict: result of the inference
"""
# Create input
X = np.array([self.__preprocess_request(request=request)])

# Predict
pred = self.__clf.predict(X)
price_category = int(pred[0])

return self.__settings.mapping_columns["price_category"][price_category]

def __preprocess_request(self, request: InferenceRequest) -> list:
"""
Extract and preprocess, from the request, the information relevant for the inference

Args:
request (InferenceRequest): Request sent to the API

Returns:
list: list of values for inference
"""
try:
neighbourhood = self.__settings.mapping_columns["neighbourhood"][request.neighbourhood]
except KeyError as key_exc:
raise HTTPException(status_code=400, detail="Neighbourhood not valid") from key_exc

try:
room_type = self.__settings.mapping_columns["room_type"][request.room_type]
except KeyError as key_exc:
raise HTTPException(status_code=400, detail="Room type not valid") from key_exc
Comment on lines +56 to +64 — @mastertilla (Contributor), Aug 10, 2022:

It might have been worth writing a function for this, since the error handling looks the same.
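A sketch of the shared helper the reviewer suggests (the `map_or_400` name is hypothetical, and the `HTTPException` stand-in below only makes the sketch self-contained; the real code would use `fastapi.HTTPException`):

```python
# Stand-in for fastapi.HTTPException, so the sketch runs without FastAPI installed
class HTTPException(Exception):
    def __init__(self, status_code: int, detail: str) -> None:
        super().__init__(detail)
        self.status_code = status_code
        self.detail = detail


def map_or_400(mapping: dict, key, field_name: str):
    """Look up `key` in `mapping`, raising a 400 error when it is unknown."""
    try:
        return mapping[key]
    except KeyError as key_exc:
        raise HTTPException(status_code=400, detail=f"{field_name} not valid") from key_exc
```

With this helper, `__preprocess_request` could collapse each try/except into a single call such as `map_or_400(self.__settings.mapping_columns["neighbourhood"], request.neighbourhood, "Neighbourhood")`.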


return [neighbourhood, room_type, request.accommodates, request.bathrooms, request.bedrooms]
2 changes: 2 additions & 0 deletions lab/api/launch.sh
@@ -0,0 +1,2 @@
#!/bin/bash
python -m uvicorn api.app:app --host 0.0.0.0 --port 80
Contributor comment:

More than a change: you can launch this same command directly from Docker, to avoid extra code dependencies.
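Following that comment, the Dockerfile's CMD could invoke uvicorn directly, making launch.sh unnecessary (a sketch):

```dockerfile
# Instead of: CMD ["./lab/api/launch.sh"]
CMD ["python", "-m", "uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "80"]
```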

33 changes: 33 additions & 0 deletions lab/api/models/inference_models.py
@@ -0,0 +1,33 @@
"""
This file contains all models used in the inference requests and responses
"""

from pydantic import BaseModel


class InferenceRequest(BaseModel):
"""
Request received for inference endpoint
"""

id: int
accommodates: int
room_type: str
beds: int
bedrooms: int
bathrooms: float
neighbourhood: str
tv: int
elevator: int
internet: int
latitude: float
longitude: float


class InferenceResponse(BaseModel):
"""
Response returned for inference endpoint
"""

id: int
price_category: str
25 changes: 25 additions & 0 deletions lab/api/settings.py
@@ -0,0 +1,25 @@
"""
This file has the class definition for app settings
"""
from pydantic import BaseSettings


class AppSettings(BaseSettings):
app_name: str = "Inference API"
model_path: str = "models/random_forest_classifier_2022-08-04 08:31:07.734769.pkl"
Contributor comment:

It would have been interesting to use an f-string so that we can pass the model date as a parameter (in Docker, for example).
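A sketch of the parameterisation the reviewer suggests, reading the model date from an environment variable (the `MODEL_DATE` variable name is an assumption; the default value is the date embedded in the current `model_path`):

```python
import os

# Hypothetical env var, e.g. set with `docker run -e MODEL_DATE=...`
model_date = os.environ.get("MODEL_DATE", "2022-08-04 08:31:07.734769")
model_path = f"models/random_forest_classifier_{model_date}.pkl"
```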

mapping_columns = {
"room_type": {
"Shared room": 1,
"Private room": 2,
"Entire home/apt": 3,
"Hotel room": 4,
},
"neighbourhood": {
"Bronx": 1,
"Queens": 2,
"Staten Island": 3,
"Brooklyn": 4,
"Manhattan": 5,
},
"price_category": {0: "Low", 1: "Mid", 2: "High", 3: "Lux"},
}
126 changes: 126 additions & 0 deletions lab/processes/config.py
@@ -0,0 +1,126 @@
"""
This file contains the classes that encapsulate the config for preprocessing
"""

import numpy as np


class DataRawColumns:
"""
This class has all names of columns for the raw dataframe
"""

ID = "id"
BATHROOMS = "bathrooms"
BATHROOMS_TEXT = "bathrooms_text"
NEIGHBOURHOOD_GROUP_CLEANSED = "neighbourhood_group_cleansed"
PROPERTY_TYPE = "property_type"
ROOM_TYPE = "room_type"
LATITUDE = "latitude"
LONGITUDE = "longitude"
ACCOMMODATES = "accommodates"
BEDROOMS = "bedrooms"
BEDS = "beds"
AMENITIES = "amenities"
PRICE = "price"

SUBSET_TRAINING = [
ID,
BATHROOMS,
NEIGHBOURHOOD_GROUP_CLEANSED,
PROPERTY_TYPE,
ROOM_TYPE,
LATITUDE,
LONGITUDE,
ACCOMMODATES,
BEDROOMS,
BEDS,
AMENITIES,
PRICE,
]


class DataPreprocessColumns:
"""
This class has all names of columns for the preprocess dataframe
"""

ID = "id"
NEIGHBOURHOOD = "neighbourhood"
PROPERTY_TYPE = "property_type"
ROOM_TYPE = "room_type"
LATITUDE = "latitude"
LONGITUDE = "longitude"
ACCOMMODATES = "accommodates"
BATHROOMS = "bathrooms"
BEDROOMS = "bedrooms"
BEDS = "beds"
PRICE = "price"
CATEGORY = "category"
TV = "TV"
INTERNET = "Internet"
AIR_CONDITIONING = "Air_conditioning"
KITCHEN = "Kitchen"
HEATING = "Heating"
WIFI = "Wifi"
ELEVATOR = "Elevator"
BREAKFAST = "Breakfast"


class ConfigPreprocess:
"""
This class encapsulates the config for preprocessing
"""

# Paths
RAW_FILE = "data/raw/listings.csv"
Contributor comment:

Try to avoid hard-coding relative (or absolute) paths. You can use pathlib to build paths that adapt to every environment.
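A minimal sketch of the pathlib approach the reviewer suggests (the `/code` root below is only an illustration, matching the container's WORKDIR; inside `config.py` the root would more likely be derived with `Path(__file__).resolve().parents[2]`):

```python
from pathlib import Path

# Hypothetical project root; derive it from __file__ in real code
PROJECT_ROOT = Path("/code")
RAW_FILE = PROJECT_ROOT / "data" / "raw" / "listings.csv"
```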

PREPROCESS_FILE = "data/processed/new_processed_listings.csv"

# Preprocess config
MIN_PRICE = 10
BINS_PRICE = [10, 90, 180, 400, np.inf]
LABELS_PRICE = [0, 1, 2, 3]
MAPING_COLUMNS = {
DataPreprocessColumns.ROOM_TYPE: {
"Shared room": 1,
"Private room": 2,
"Entire home/apt": 3,
"Hotel room": 4,
},
DataPreprocessColumns.NEIGHBOURHOOD: {
"Bronx": 1,
"Queens": 2,
"Staten Island": 3,
"Brooklyn": 4,
"Manhattan": 5,
},
}
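To illustrate how `BINS_PRICE` and `LABELS_PRICE` work together (the preprocessing itself presumably uses `pandas.cut`; this stdlib sketch only mirrors the idea of right-closed bins for prices above `MIN_PRICE`):

```python
import bisect
import math

BINS_PRICE = [10, 90, 180, 400, math.inf]
LABELS_PRICE = [0, 1, 2, 3]


def price_category(price: float) -> int:
    """Return the label of the right-closed bin (lo, hi] containing `price`."""
    # bisect_left finds the first boundary >= price, so boundary values
    # (e.g. exactly 90) fall in the lower bin, as with pandas.cut
    return LABELS_PRICE[bisect.bisect_left(BINS_PRICE, price) - 1]


print(price_category(50))   # 0  (10 < 50 <= 90)
print(price_category(91))   # 1  (90 < 91 <= 180)
```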


class ConfigTrain:
"""
This class encapsulates the config for the train process
"""

# Features info
FEATURE_NAMES = [
DataPreprocessColumns.NEIGHBOURHOOD,
DataPreprocessColumns.ROOM_TYPE,
DataPreprocessColumns.ACCOMMODATES,
DataPreprocessColumns.BATHROOMS,
DataPreprocessColumns.BEDROOMS,
]
FEATURE_CATEGORY = DataPreprocessColumns.CATEGORY

# Split parameters
TEST_SIZE = 0.15
RANDOM_STATE_SPLIT = 1

# Train parameters
N_ESTIMATORS = 500
RANDOM_STATE_TRAIN = 0
CLASS_WEIGHT = "balanced"
N_JOBS = 4

# Paths
FOLDER_PATH = "models/"