# summarizer.py
# Forked from itsmetamike/vtt-summarizer.
import os
from datetime import datetime
import json
import webvtt
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.chat_models import ChatOllama
from langchain_core.exceptions import OutputParserException
# Define the data structure for the meeting notes using Pydantic.
# This model is handed to JsonOutputParser in extract_meeting_notes, so the
# field names and descriptions below end up in the format instructions sent to
# the LLM. Documentation is kept in `#` comments on purpose: a class docstring
# would become part of the generated schema and therefore of the prompt.
class MeetingNotes(BaseModel):
    # List of strings summarizing the meeting (per the extraction prompt).
    meeting_overview: list = Field(description="Overview of the meeting")
    # List of participant names.
    participants: list = Field(description="Participants in the meeting")
    # List of topic strings.
    main_topics: list = Field(description="Main topics discussed in the meeting (5-10 topics)")
    # List of objects; the extraction prompt asks for "decision" and "impact" keys.
    key_points_decisions: list = Field(description="Key points and decisions made in the meeting (25-50 points)")
    # List of objects; the extraction prompt asks for "item", "assigned_to"
    # and "description" keys.
    action_items: list = Field(description="Action items from the meeting (5-10 items)")
# Set up the prompt template for initial extraction.
# Placeholders: {format_instructions} (supplied as a partial variable from the
# JSON output parser) and {transcript} (the flattened caption text). The
# template is rendered through PromptTemplate in extract_meeting_notes.
initial_prompt_template = """
Please extract the following information from the meeting transcript and format it as a JSON object:
{format_instructions}
Transcript:
{transcript}
Your response must be a single, well-formed JSON object with keys: meeting_overview, participants, main_topics, key_points_decisions, and action_items.
- For "meeting_overview", provide a list of strings summarizing the meeting.
- For "participants", provide a list of participant names as strings.
- For "main_topics", provide a list of a minimum of 5 to 10 strings representing the topics discussed in detail.
- For "key_points_decisions", provide a list of a minimum of 25 to 50 objects, each containing "decision" and "impact" keys with string values.
- For "action_items", provide a list of a minimum of 5 to 10 objects, each containing "item", "assigned_to", and "description" keys with string values.
Ensure that the response is valid JSON and includes detailed information for each key. Do not include any comments or additional text outside the JSON object. Do not omit items for brevity.
"""
# Set up the prompt template for JSON cleanup.
# Single placeholder: {raw_json}, filled with str.format using the raw text of
# the first model response. The template must not contain any other braces,
# because str.format would treat them as replacement fields.
cleanup_prompt_template = """
Please clean and validate the following JSON. Ensure it is well-formed and remove any unnecessary text or errors. Only return the JSON object.
Raw JSON:
{raw_json}
Cleaned JSON:
"""
# Initialize the LLM models.
# NOTE: these statements run at import time, so importing this module prints
# the two messages below and constructs both ChatOllama clients.
local_llm = 'phi3'
cleanup_llm = 'phi3' # You can use the same or a different model for cleanup
print(f"Initializing LLM for extraction: {local_llm}")
# Extraction model: used for the first pass over the transcript.
model = ChatOllama(model=local_llm, temperature=0.1)
print(f"Initializing LLM for cleanup: {cleanup_llm}")
# Cleanup model: used for the second pass that tidies the raw JSON (temperature 0).
cleanup_model = ChatOllama(model=cleanup_llm, temperature=0)
def _strip_trailing_commas(text):
    """Drop commas that directly precede a closing ``]`` or ``}``.

    Whitespace (including newlines) between the comma and the closing bracket
    is allowed. Commas inside double-quoted JSON strings are left untouched.
    """
    kept = []
    in_string = False
    escaped = False
    length = len(text)
    for index, char in enumerate(text):
        if in_string:
            kept.append(char)
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == ",":
            # Look past whitespace to see what the comma is followed by.
            lookahead = index + 1
            while lookahead < length and text[lookahead].isspace():
                lookahead += 1
            if lookahead < length and text[lookahead] in "]}":
                continue  # trailing comma: skip it
        kept.append(char)
    return "".join(kept)


def clean_json_output(raw_content):
    """Normalise LLM output so it is more likely to parse as JSON.

    Steps, in order:
      1. strip surrounding whitespace;
      2. remove a leading Markdown code fence (```` ``` ```` with or without a
         ``json`` tag) and a trailing ```` ``` ````;
      3. drop whole-line ``//`` comments;
      4. remove trailing commas before ``]`` / ``}``, without touching string
         contents.

    Already valid JSON passes through structurally unchanged. No attempt is
    made to *insert* missing commas: doing so line-by-line corrupted valid
    input (for example turning ``[`` into ``[,``).

    Args:
        raw_content: Text returned by the model.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON; the caller
        is expected to parse it and handle failures.
    """
    text = raw_content.strip()

    # Remove code block markers if present.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    # Remove comments (lines starting with `//`), which JSON does not allow.
    json_lines = [
        line for line in text.split("\n")
        if not line.strip().startswith("//")
    ]

    return _strip_trailing_commas("\n".join(json_lines))
def escape_curly_braces(text):
    """Return ``text`` with every ``{`` and ``}`` doubled.

    Doubling is how literal braces are written in ``str.format``-style
    templates.
    """
    brace_escapes = str.maketrans({"{": "{{", "}": "}}"})
    return text.translate(brace_escapes)
def extract_meeting_notes(transcript):
    """Extract structured meeting notes from a transcript using two LLM passes.

    The first pass asks ``model`` for a JSON object matching ``MeetingNotes``.
    The second pass asks ``cleanup_model`` to tidy that JSON. The result is
    then cleaned with ``clean_json_output``, parsed, and validated.

    Args:
        transcript: The transcript text to summarise.

    Returns:
        A ``MeetingNotes`` instance, or ``None`` when the model output could
        not be parsed or validated.
    """
    # Create the JSON output parser
    parser = JsonOutputParser(pydantic_object=MeetingNotes)

    # Partial-variable values are substituted into the template verbatim and
    # are never re-parsed as format strings, so they must NOT be brace-escaped;
    # escaping here would show the model a schema full of `{{` / `}}`.
    format_instructions = parser.get_format_instructions()

    # Create and render the initial prompt
    prompt = PromptTemplate(
        template=initial_prompt_template,
        input_variables=["transcript"],
        partial_variables={"format_instructions": format_instructions},
    )
    query = prompt.format(transcript=transcript)
    print("Query:", query)

    # Invoke the LLM to get the initial response
    response = model.invoke(query)
    raw_content = response.content
    print("Initial Response:", raw_content)

    # Clean the JSON output using another LLM
    cleanup_prompt = cleanup_prompt_template.format(raw_json=raw_content)
    cleanup_response = cleanup_model.invoke(cleanup_prompt)
    cleaned_raw_content = cleanup_response.content
    print("Cleaned Raw Content:", cleaned_raw_content)

    # Further clean the JSON output
    cleaned_json = clean_json_output(cleaned_raw_content)
    print("Final Cleaned JSON:", cleaned_json)

    # Parse and validate the response
    try:
        result = parser.parse(cleaned_json)
        if isinstance(result, dict):
            # Ensure all required fields are present
            for key in MeetingNotes.__fields__:
                result.setdefault(key, [])
            # Some models return nested structures as JSON-encoded strings;
            # decode those, and leave plain strings for validation to judge.
            for key, value in result.items():
                if isinstance(value, str):
                    try:
                        result[key] = json.loads(value)
                    except json.JSONDecodeError:
                        pass
        result = MeetingNotes(**result)
    except (OutputParserException, ValueError, KeyError, TypeError) as e:
        # ValueError also covers json.JSONDecodeError and pydantic's
        # ValidationError, so schema violations yield None instead of a crash.
        print(f"Error parsing JSON response: {e}")
        result = None
    return result
def read_vtt_file(file_path):
    """Read a WebVTT file and return its caption texts joined by single spaces.

    Prints the path being read and the length of the resulting transcript.
    """
    print("Reading transcript from file:", file_path)
    captions = webvtt.read(file_path)
    transcript = " ".join(caption.text for caption in captions)
    print("Transcript length:", len(transcript))
    return transcript
def save_json_to_file(data, file_path):
    """Write ``data`` to ``file_path`` as indented JSON.

    Missing parent directories are created. A path without a directory
    component (for example ``"out.json"``) is written to the current working
    directory.

    Args:
        data: A JSON-serialisable object.
        file_path: Destination file path.
    """
    # os.path.dirname returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Extracted information saved to {file_path}")
def save_md_to_file(data, file_path):
    """Render the meeting notes in ``data`` as Markdown and write to ``file_path``.

    ``data`` must contain the keys ``meeting_overview``, ``participants``,
    ``main_topics``, ``key_points_decisions`` and ``action_items``, each
    mapping to a list. Decision and action-item entries are expected to be
    dicts, but because they come from an LLM the renderer is tolerant: a
    missing key is rendered as ``N/A`` and a non-dict entry is rendered as a
    plain bullet.

    The document is built in memory first, so a malformed ``data`` does not
    leave a half-written file behind. Missing parent directories are created.

    Args:
        data: Mapping with the five keys listed above.
        file_path: Destination file path.
    """
    missing = "N/A"

    parts = ["# Meeting Overview\n"]
    parts.extend(f"- {overview}\n" for overview in data['meeting_overview'])

    parts.append("\n## Participants\n")
    parts.extend(f"- {participant}\n" for participant in data['participants'])

    parts.append("\n## Main Topics\n")
    parts.extend(f"- {topic}\n" for topic in data['main_topics'])

    parts.append("\n## Key Points and Decisions\n")
    for point in data['key_points_decisions']:
        if isinstance(point, dict):
            parts.append(
                f"- **Decision**: {point.get('decision', missing)}\n"
                f" **Impact**: {point.get('impact', missing)}\n"
            )
        else:
            parts.append(f"- {point}\n")

    parts.append("\n## Action Items\n")
    for item in data['action_items']:
        if isinstance(item, dict):
            parts.append(
                f"- **Item**: {item.get('item', missing)}\n"
                f" **Assigned to**: {item.get('assigned_to', missing)}\n"
                f" **Description**: {item.get('description', missing)}\n"
            )
        else:
            parts.append(f"- {item}\n")

    # os.makedirs("") raises, so skip it when the path has no directory part.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Explicit UTF-8 so non-ASCII names do not depend on the locale encoding.
    with open(file_path, 'w', encoding="utf-8") as f:
        f.writelines(parts)
    print(f"Markdown file saved to {file_path}")
def main(file_path=None):
    """Summarise one VTT transcript and save the result as JSON and Markdown.

    Args:
        file_path: Path to the ``.vtt`` transcript. When omitted, the
            project's default sample transcript path is used, which matches
            the behaviour of calling ``main()`` with no arguments.

    Returns:
        None. If the transcript file does not exist, a message is printed and
        nothing else happens. If extraction fails, a message is printed and
        no files are written.
    """
    if file_path is None:
        file_path = r"../dfg-transcripts/2024/1Q24/MSF-Digital-Fashion-Wearables-for-Avatars-2024-01-08_16h02_audio_transcript.vtt"

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    transcript = read_vtt_file(file_path)
    result = extract_meeting_notes(transcript)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    # splitext removes only the final extension; str.replace(".vtt", "") would
    # also delete ".vtt" occurring elsewhere in the file name.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    json_file_path = f'./json/summary_{base_name}_{timestamp}.json'
    md_file_path = f'./md/summary_{base_name}_{timestamp}.md'

    print("\nExtracted Information:")
    if result:
        result_dict = result.dict()
        print(json.dumps(result_dict, indent=2))
        save_json_to_file(result_dict, json_file_path)
        save_md_to_file(result_dict, md_file_path)
    else:
        print("No valid information extracted.")
    print("Information extraction completed.")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()