Skip to content

Commit 0db3c72

Browse files
committed
ChatGPT fine-tune large dataset
1 parent 81eef9d commit 0db3c72

File tree

3 files changed

+129
-0
lines changed

3 files changed

+129
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ dmypy.json
153153

154154
# Cython debug symbols
155155
cython_debug/
156+
envfinetune/
157+
*.jsonl
156158

157159
# PyCharm
158160
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can

mongodb_extract_data.ipynb

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"ok\n"
13+
]
14+
}
15+
],
16+
"source": [
17+
"print('ok')"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 4,
23+
"metadata": {},
24+
"outputs": [
25+
{
26+
"data": {
27+
"text/plain": [
28+
"True"
29+
]
30+
},
31+
"execution_count": 4,
32+
"metadata": {},
33+
"output_type": "execute_result"
34+
}
35+
],
36+
"source": [
37+
"import os\n",
38+
"import json\n",
39+
"from pymongo import MongoClient\n",
40+
"from IPython.display import Markdown\n",
41+
"import nest_asyncio\n",
42+
"nest_asyncio.apply()\n",
43+
"from dotenv import load_dotenv\n",
44+
"load_dotenv()"
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": null,
50+
"metadata": {},
51+
"outputs": [],
52+
"source": [
53+
"cloudmongo = os.environ.get(\"cloudmongodb\")\n",
54+
"print(cloudmongo)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 15,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"def get_mongo_collection(db_name, collection_name, uri=cloudmongo):\n",
64+
" client = MongoClient(uri)\n",
65+
" db = client[db_name]\n",
66+
" return db[collection_name]\n",
67+
"\n",
68+
"def create_text_structure(collection):\n",
69+
" lines = []\n",
70+
"\n",
71+
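" # Use a regular expression to match titles containing \"Let's Go to\". NOTE: title and transcript are interpolated into the line below without JSON escaping, so quotes, backslashes, or newlines in them yield invalid JSONL; json.dumps would be safer.\n",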
72+
" query = {\"title\": {\"$regex\": \"Let's Go to\"}}\n",
73+
" for doc in collection.find(query):\n",
74+
" line = f'{{\"messages\": [{{\"role\": \"system\", \"content\": \"Text Generator\"}}, {{\"role\": \"user\", \"content\": \"{doc.get(\"title\", \"\")}\"}}, {{\"role\": \"assistant\", \"content\": \"{doc.get(\"transcript\", \"\")}\"}}]}}'\n",
75+
" lines.append(line)\n",
76+
"\n",
77+
" return lines"
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": 16,
83+
"metadata": {},
84+
"outputs": [],
85+
"source": [
86+
"# The database ('Youtube') and collection ('ZeeshanUsmaniYouTube') names are hard-coded below; replace them with your actual database and collection names\n",
87+
"collection = get_mongo_collection('Youtube', 'ZeeshanUsmaniYouTube')\n",
88+
"lines = create_text_structure(collection)\n",
89+
" \n",
90+
" # Save to a JSON Lines file (one record per line) with UTF-8 encoding\n",
91+
"with open(\"zeeshanusmani.jsonl\", \"w\", encoding=\"utf-8\") as outfile:\n",
92+
" for line in lines:\n",
93+
" outfile.write(line + '\\n')"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"metadata": {},
100+
"outputs": [],
101+
"source": []
102+
}
103+
],
104+
"metadata": {
105+
"kernelspec": {
106+
"display_name": "Python 3",
107+
"language": "python",
108+
"name": "python3"
109+
},
110+
"language_info": {
111+
"codemirror_mode": {
112+
"name": "ipython",
113+
"version": 3
114+
},
115+
"file_extension": ".py",
116+
"mimetype": "text/x-python",
117+
"name": "python",
118+
"nbconvert_exporter": "python",
119+
"pygments_lexer": "ipython3",
120+
"version": "3.11.8"
121+
}
122+
},
123+
"nbformat": 4,
124+
"nbformat_minor": 2
125+
}

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pymongo
2+
python_dotenv

0 commit comments

Comments
 (0)