chat gpt fine turn large dataset

leodeveloper · leodeveloper · commit 0db3c7281b06 · 2024-07-06T12:29:32.000+04:00
diff --git a/.gitignore b/.gitignore
@@ -153,6 +153,8 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+envfinetune/
+*.jsonl
 
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
diff --git a/mongodb_extract_data.ipynb b/mongodb_extract_data.ipynb
@@ -0,0 +1,125 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ok\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('ok')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from pymongo import MongoClient\n",
+    "from IPython.display import Markdown\n",
+    "import nest_asyncio\n",
+    "nest_asyncio.apply()\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cloudmongo = os.environ.get(\"cloudmongodb\")\n",
+    "print(cloudmongo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_mongo_collection(db_name, collection_name, uri=cloudmongo):\n",
+    "    client = MongoClient(uri)\n",
+    "    db = client[db_name]\n",
+    "    return db[collection_name]\n",
+    "\n",
+    "def create_text_structure(collection):\n",
+    "    lines = []\n",
+    "\n",
+    "    # Use a regular expression to match titles containing \"Let's Go to\"\n",
+    "    query = {\"title\": {\"$regex\": \"Let's Go to\"}}\n",
+    "    for doc in collection.find(query):\n",
+    "        line = f'{{\"messages\": [{{\"role\": \"system\", \"content\": \"Text Generator\"}}, {{\"role\": \"user\", \"content\": \"{doc.get(\"title\", \"\")}\"}}, {{\"role\": \"assistant\", \"content\": \"{doc.get(\"transcript\", \"\")}\"}}]}}'\n",
+    "        lines.append(line)\n",
+    "\n",
+    "    return lines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace 'your_db_name' and 'your_collection_name' with your actual database and collection names\n",
+    "collection = get_mongo_collection('Youtube', 'ZeeshanUsmaniYouTube')\n",
+    "lines = create_text_structure(collection)\n",
+    "    \n",
+    "    # Save to a text file with UTF-8 encoding\n",
+    "with open(\"zeeshanusmani.jsonl\", \"w\", encoding=\"utf-8\") as outfile:\n",
+    "    for line in lines:\n",
+    "        outfile.write(line + '\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+pymongo
+python_dotenv