Make data directory parent configurable #8

Open
wants to merge 1 commit into base: main
10 changes: 9 additions & 1 deletion README.md
@@ -54,6 +54,14 @@ and where.
```

- For Linux or MacOS:
```sh
./run_linux.sh
```
Or with a custom directory for the wiki data (the parent of `wiki-dataset` and `txtai-wikipedia`; see the sketch after this list):
```sh
./run_linux.sh --database_dir path/to/datadirs
```
- The script was tested on Linux and might also work on MacOS.
- There are additional scripts in the "Untested" folder, though there is a known issue on MacOS related to git. A workaround
is described in that folder's README.
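
For a concrete picture (a sketch only; `path/to/datadirs` is a placeholder), the directory passed with `--database_dir` just needs to contain the two dataset folders, and `start_api.py` accepts the same flag directly:

```sh
# Placeholder paths: the chosen parent directory must contain both dataset folders
ls path/to/datadirs
# wiki-dataset  txtai-wikipedia

# run_linux.sh forwards the directory to start_api.py, which can also be launched directly:
python start_api.py --database_dir path/to/datadirs
```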

@@ -131,7 +139,7 @@ The API configuration is managed through the `config.json` file:
```

The "verbose" setting controls whether the API library uvicorn outputs all logs or only warning-level
logs. It is set to warning by default.
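
As a quick illustration (a sketch only: the key name comes from the description above, but the boolean value type is an assumption and other `config.json` keys are omitted), the setting can be flipped from the shell:

```sh
# Assumption: "verbose" is a boolean; true = full uvicorn logs, false = warnings only.
# Requires jq; writes the updated config back to config.json.
jq '.verbose = true' config.json > config.json.tmp && mv config.json.tmp config.json
```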

## Endpoints

6 changes: 0 additions & 6 deletions Untested/README.md
@@ -7,12 +7,6 @@ run them.

Current Contents:
------------------------
**run_linux.sh** - This is an AI-translated Linux version of the run_windows.bat file.
I do not have a Linux system to test with, so I can't be sure it'll work. If you
have a Linux system and want to give it a try, please let me know how it goes. I
do urge you, however, to be cautious and look over the file first to ensure that
it doesn't do anything on your system that you aren't comfortable with.

**run_macos.sh** - This I have run on my own Mac Studio and it worked EXCEPT for the
git clone call. I need to find a workaround for that. More information in the "KNOWN ISSUE"
section below.
44 changes: 0 additions & 44 deletions Untested/run_linux.sh

This file was deleted.

84 changes: 84 additions & 0 deletions run_linux.sh
@@ -0,0 +1,84 @@
#!/bin/bash

# Stop script on any error code and trap errors for easier debugging
set -eE
trap 'echo >&2 "Error - exited with status $? at line $LINENO"' ERR

# Step 0: Parse any arguments we care about
DATABASE_DIR="."
OTHER_ARGS=()

function help() {
echo "usage: $0 [-h] [-d DATABASE_DIR]"
echo
echo "Offline Wikipedia Text API"
echo
echo "options:"
echo "-h, --help show this help message and exit"
echo "-d DATABASE_DIR, --database_dir DATABASE_DIR"
echo " Base directory containing the wiki-dataset and txtai-wikipedia"
echo " folders."
}

while [[ $# -gt 0 ]]; do
case $1 in
--database_dir|-d)
DATABASE_DIR="$2"
shift 2
;;
--help|-h)
help
exit 0
;;
*)
# For any unrecognized args, store them to pass through
OTHER_ARGS+=("$1")
shift
;;
esac
done


# Derive the dataset directories from the (possibly overridden) base directory
WIKI_DATA_SET_DIR="$DATABASE_DIR/wiki-dataset"
TXTAI_WIKIPEDIA_DIR="$DATABASE_DIR/txtai-wikipedia"

# Step A: Create and activate a Python virtual environment
echo Creating virtual environment
if [ ! -d "venv" ]; then
python -m venv venv
else
echo Existing venv detected. Activating.
fi

echo Activating virtual environment
source venv/bin/activate

# Step B: Install requirements from requirements.txt
echo ---------------------------------------------------------------
echo Installing python requirements from requirements.txt
pip install -r requirements.txt

# Step C: Clone the git repository for full wiki articles into a directory called "wiki-dataset"
echo ---------------------------------------------------------------
echo Downloading Wikipedia dataset. As of 2024-11-14, this is about 44GB
if [ ! -d "$WIKI_DATA_SET_DIR" ]; then
git clone https://huggingface.co/datasets/NeuML/wikipedia-20240901 "$WIKI_DATA_SET_DIR"
else
echo Existing wiki-dataset directory detected.
fi

# Step D: Clone the git repository for txtai wiki summaries into a directory called txtai-wikipedia
echo ---------------------------------------------------------------
echo Downloading txtai-wikipedia dataset. As of 2024-11-14, this is about 15GB.
if [ ! -d "$TXTAI_WIKIPEDIA_DIR" ]; then
git clone https://huggingface.co/NeuML/txtai-wikipedia "$TXTAI_WIKIPEDIA_DIR"
else
echo Existing txtai-wikipedia directory detected.
fi

# Finally: Start the API
echo ---------------------------------------------------------------
echo Starting API. If this is the first run, setup may take 10-15 minutes depending on your machine.
echo Setup time is due to indexing wikipedia article titles into a json file for API speed.
echo ---------------------------------------------------------------
echo API Starting...
python start_api.py --database_dir "$DATABASE_DIR" "${OTHER_ARGS[@]}"
15 changes: 13 additions & 2 deletions start_api.py
@@ -1,3 +1,4 @@
import argparse
import os
import json
from typing import List, Dict
@@ -15,8 +16,18 @@
# Correcting an issue in Windows
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

WIKI_DATASET_DIR = os.path.join("wiki-dataset", "train")
TXT_AI_DIR = "txtai-wikipedia"
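# Allow the parent directory of the wiki datasets to be overridden from the command line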
parser = argparse.ArgumentParser(description="Offline Wikipedia Text API")
parser.add_argument(
"-d",
"--database_dir",
default=".",
help="Base directory containing the wiki-dataset and txtai-wikipedia folders."
)
args = parser.parse_args()

DATABASE_DIR = args.database_dir
WIKI_DATASET_DIR = os.path.join(DATABASE_DIR, "wiki-dataset", "train")
TXT_AI_DIR = os.path.join(DATABASE_DIR, "txtai-wikipedia")
DICTIONARY_FILE = "title_to_index.json"
CONFIG_FILE = "config.json"
