Commit

cleanup main folder

rmusser01 committed May 1, 2024
1 parent 7249b9a commit 9ae51b0
Showing 9 changed files with 698 additions and 0 deletions.
61 changes: 61 additions & 0 deletions tldw-scripts/chunker.py
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
import string
import json

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer', use_fast=True)

def segment_merger(filename, max_text_len=1000):
    # Merge consecutive segments from the same speaker, breaking early once the
    # running text exceeds max_text_len and ends on sentence-final punctuation.
    segments = json.load(open(filename))

    text = ''
    last_segment = {'speaker': None}
    start_time = None
    stop_chars = string.punctuation.replace(',', '')  # punctuation that may end a merged segment

    for segment in segments:
        early_break = (max_text_len > 0) and (len(text) > max_text_len) and (text[-1] in stop_chars)
        if last_segment['speaker'] != segment['speaker'] or early_break:
            if text != '':
                yield {'speaker': last_segment['speaker'], 'text': text, 'start': start_time, 'end': last_segment['end']}
            text = segment['text'].lstrip()
            start_time = segment['start']
        else:
            text += segment['text']
        last_segment = segment

    if text != '':
        yield {'speaker': last_segment['speaker'], 'text': text, 'start': start_time, 'end': last_segment['end']}

def time_splitter(merged_segments, chunk_size=300):
    # Group speaker-merged segments into chunks of roughly chunk_size seconds.
    start_time = None
    text = ''
    speakers = []

    for segment in merged_segments:
        if start_time is None:
            start_time = segment['start']
        if segment['speaker'] not in speakers:
            speakers.append(segment['speaker'])
        text += f"{segment['speaker']}: {segment['text']}\n"
        if segment['end'] - start_time >= chunk_size:
            yield {'text': text, 'start': start_time, 'end': segment['end'], 'speakers': speakers}
            start_time = None
            text = ''
            speakers = []

    # Flush any trailing text shorter than chunk_size so the final chunk is not dropped.
    if text != '':
        yield {'text': text, 'start': start_time, 'end': segment['end'], 'speakers': speakers}

def main(prefix: str, chunk_size: int = 300, max_text_len: int = 800):
    merged_segments = list(segment_merger(prefix + '.diarize.json', max_text_len))
    split_segments = list(time_splitter(merged_segments, chunk_size))
    max_tokens = 0
    with open(prefix + '.chunk.json', 'w') as f:
        json.dump(split_segments, f)
    for idx, segment in enumerate(split_segments):
        tokens = tokenizer.encode(segment['text'])  # token ids, used only to size the chunk
        if len(tokens) > max_tokens:
            max_tokens = len(tokens)
        print(f"Segment {idx}: {len(tokens)} tokens, {len(segment['text'])} characters, {int(segment['end'] - segment['start'])} seconds")

    print(f"Largest chunk was {max_tokens} tokens")
    print(f"Wrote {len(split_segments)} chunks to {prefix}.chunk.json")

if __name__ == "__main__":
    import fire
    fire.Fire(main)
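
Usage note: fire.Fire(main) exposes main as a CLI, so the script can be driven directly. A minimal sketch of the input it expects, assuming the upstream diarization/transcription step emits a JSON list of segments carrying speaker, text, start, and end keys (the "lex" prefix and sample values are hypothetical):

# sketch: fabricate a tiny lex.diarize.json, then chunk it
import json
segments = [
    {'speaker': 'SPEAKER 1', 'text': ' Hello and welcome.', 'start': 0.0, 'end': 2.5},
    {'speaker': 'SPEAKER 2', 'text': ' Thanks for having me.', 'start': 2.5, 'end': 4.0},
]
json.dump(segments, open('lex.diarize.json', 'w'))
# then, from a shell:
#   ./chunker.py lex --chunk_size 300 --max_text_len 800
# which writes lex.chunk.json, a list of {text, start, end, speakers} chunks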
57 changes: 57 additions & 0 deletions tldw-scripts/compare-app.py
@@ -0,0 +1,57 @@
import json
import glob

import streamlit as st

def load_analysis_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def display_analysis_data(data):
    tests = data['tests']
    models_list = data['models']
    models = {}
    for model_info in models_list:
        models[model_info['id']] = model_info

    # Summary row: one column per model.
    summary_cols = st.columns(len(models_list))
    for model_id, model_info in models.items():
        with summary_cols[model_info['idx']]:
            st.subheader(f"{model_info['short_name']}")

    for test_name, test_data in tests.items():
        st.markdown(f"#### {test_name}")

        columns = st.columns(len(models))
        if 'summary' in test_data:
            st.markdown("**Analysis**: " + test_data['summary'])

        for model_id, model_result in test_data['results'].items():
            model_info = models[model_id]

            # Color passing tests blue and failing tests red.
            model_result['passing_tests'] = '\n\n'.join([f":blue[{x}]" for x in model_result['passing_tests'].split('\n') if x.strip() != ''])
            model_result['failing_tests'] = '\n\n'.join([f":red[{x}]" for x in model_result['failing_tests'].split('\n') if x.strip() != ''])

            with columns[model_info['idx']]:
                #st.subheader(f"{model_info['short_name']}")
                st.write(model_result['answer'])

st.set_page_config(page_title='Analysis Explorer', layout="wide")
st.markdown("""
<style>
    .block-container {
        padding-top: 2rem;
        padding-bottom: 0rem;
        padding-left: 3rem;
        padding-right: 3.5rem;
    }
</style>
""", unsafe_allow_html=True)

files = sorted(glob.glob('compare/*.json'))
data = [json.load(open(file, 'r')) for file in files]
titles = [x['config']['title'] for x in data]
selected = st.selectbox('Select Summary', titles)
idx = titles.index(selected)
display_analysis_data(data[idx])
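
Launch with: streamlit run compare-app.py. The app reads every compare/*.json written by compare.py; a minimal sketch of the shape each file must have, assembled from the fields the code above reads (all values hypothetical):

# sketch of one compare/*.json file
example = {
    'config': {'title': 'My Comparison (english)'},
    'models': [{'id': 'model-a', 'short_name': 'Model A', 'idx': 0}],
    'tests': {
        'some-test': {
            'summary': 'optional analyser text',
            'results': {'model-a': {'answer': '...', 'passing_tests': '', 'failing_tests': ''}},
        },
    },
}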
69 changes: 69 additions & 0 deletions tldw-scripts/compare.py
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
import json
import os
from copy import copy

import fire
import yaml

def prepare(TEST_LANGUAGE, path, files):
    out = {}
    models = []

    for idx, info in enumerate(files):
        file = os.path.join(path, info['eval'])
        id = info['id']

        # Evaluation filenames encode metadata as underscore-separated tags.
        tags = os.path.basename(file).replace('.ndjson', '').split('_')
        prompt = tags[3]
        params = tags[5]
        model = tags[6]

        models.append({'prompt': prompt, 'short_name': info['short_name'], 'params': params, 'model': model, 'id': id, 'idx': idx, 'passed': 0, 'total': 0})
        results = [json.loads(line) for line in open(file)]

        for r in results:
            if r['language'] != TEST_LANGUAGE:
                continue

            testid = r['name'] + '-' + r['language']
            if testid not in out:
                out[testid] = {'results': {}, 'task': '', 'language': r['language']}

            check_summary = ''
            passing_tests = ''
            failing_tests = ''

            out[testid]['results'][id] = {
                'check_summary': check_summary,
                'passing_tests': passing_tests,
                'failing_tests': failing_tests,
                #'code': r['code'],
                'answer': r['answer']
            }

            #models[idx]['passed'] += r['passed']
            #models[idx]['total'] += r['total']

    return {'tests': out, 'models': models}

def main(config: str, path: str = "./", analyser: str = "", language: str = "english"):
    cfg = yaml.safe_load(open(config))

    for lang in language.split(','):
        cfg['language'] = lang
        print('Comparing results for', lang)
        data = prepare(cfg['language'], path, cfg['models'])
        data['config'] = copy(cfg)
        data['config']['title'] += f" ({lang})"
        data['analyser'] = analyser

        if analyser != "":
            # `analysis` is not defined in this file; it is assumed to be provided elsewhere.
            analysis(data, analyser)

        outfile = config.replace('.yaml', f'-{lang}.json')
        with open(outfile, 'w') as f:
            json.dump(data, f, indent=4)

if __name__ == "__main__":
    fire.Fire(main)
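
main() loads a YAML config whose fields prepare() consumes; a sketch, generated from Python to keep it self-contained. The eval filename is hypothetical but follows the underscore-tag layout the code slices (tags[3] = prompt, tags[5] = params, tags[6] = model):

# sketch: generate a minimal config, then run: ./compare.py compare.yaml
import yaml
cfg = {
    'title': 'My Comparison',  # hypothetical
    'models': [
        {'id': 'model-a', 'short_name': 'Model A',
         # splits to ['interview','junior','v2','promptA','x','greedy','llama']
         'eval': 'interview_junior_v2_promptA_x_greedy_llama.ndjson'},  # hypothetical file
    ],
}
yaml.safe_dump(cfg, open('compare.yaml', 'w'))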
29 changes: 29 additions & 0 deletions tldw-scripts/merger.py
@@ -0,0 +1,29 @@
import json
import sys

from transformers import AutoTokenizer

in_file = sys.argv[1]
with open(in_file) as infile:
    chunks = [json.loads(line) for line in infile.readlines()]

def part_to_time(part):
    # Each part covers five minutes; format the offset as HH:MM.
    mins = part * 5
    oh = mins // 60
    om = mins % 60
    return f'{oh:02}:{om:02}'

text = ''
for idx, chunk in enumerate(chunks):
    #text += f'\n\n[{part_to_time(idx)} - {part_to_time(idx+1)}] '
    text += f'\nSection {idx + 1}: {chunk["answer"]}\n'

out_file = in_file.replace('ndjson', 'txt')
with open(out_file, 'w') as outfile:
    outfile.write(text)

tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer', use_fast=True)
tokens = tokenizer.encode(text)  # token ids, used to report the summary's token count

print('chunks:', len(chunks))
print('summary bytes:', len(text))
print('summary tokens:', len(tokens))
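
Usage sketch (the filename is hypothetical; any NDJSON whose lines are JSON objects with an "answer" field works):

python merger.py lex.summary.ndjson   # writes lex.summary.txt and prints chunk/byte/token counts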
10 changes: 10 additions & 0 deletions tldw-scripts/pyannote.py
@@ -0,0 +1,10 @@
import torch
from pyannote.audio import Pipeline

# Loading this pipeline requires accepting the model's terms on the Hugging Face Hub
# (and typically a Hub access token).
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization").to(torch.device("cuda"))

# Apply the pretrained diarization pipeline.
diarization = pipeline("lex.wav", num_speakers=2)

# Print the result, one speaker turn per line.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
86 changes: 86 additions & 0 deletions tldw-scripts/roller-chatgpt-v2.py
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
import json

from jinja2 import Template
from openai import OpenAI

prompt_template = """
Continue the rolling transcription summary of "{{title}}". Consider the current context when summarizing the given transcription part.

### Context: {{ context }}

Speaker-Map: {{ speakermap }}

### Transcription part {{ idx }} of {{ len }}, start time {{ start }}:

{{ chunk }}

### Instruction: Using the Context above, analyze the Transcription and respond with a JSON object in this form:

{
    "Speaker-Map": { "SPEAKER 1": "Bob Dole", "SPEAKER 2": "Jane Doe" }, // A map of speakers to their names, make sure to remember all previous speakers.
    "Next-Context": "...", // An updated context for the next part of the transcription. Always include the speakers and the current topics of discussion.
    "Summary": "..." // A detailed, point-by-point summary of the current transcription.
}
"""

client = OpenAI()

def main(prefix: str, init_speakers: str = ""):
    the_template = Template(prompt_template)

    split_segments = json.load(open(prefix + '.chunk.json'))
    info = json.load(open(prefix + '.info.json'))

    context = f"""
Video Title: {info['title']}
Video Description: {info['description'][:1024]}
"""

    speakers = "{ UNKNOWN }"

    f = open(prefix + '.summary.json', 'w')
    idx = 0
    for chunk in split_segments:
        dur = chunk['end'] - chunk['start']
        print(f"{idx}: {dur}s {len(chunk['text'])} chars")

        prompt = the_template.render(chunk=chunk['text'], start=chunk['start'], end=chunk['end'],
                                     idx=idx, len=len(split_segments), context=context,
                                     speakermap=speakers, title=info['title'])

        messages = [{'role': 'user', 'content': prompt}]
        response = client.chat.completions.create(messages=messages, model='gpt-3.5-turbo-1106',
                                                  temperature=0.1, max_tokens=1024,
                                                  response_format={"type": "json_object"})

        answer = response.choices[0].message.content
        parsed = json.loads(answer)

        summary = parsed.get('Summary', '')
        new_speakers = parsed.get('Speaker-Map', '')
        new_context = parsed.get('Next-Context', '')

        if summary == '' or new_context == '' or new_speakers == '':
            print('extraction failed:', new_context, new_speakers, summary)
            exit(1)
        else:
            section = {
                'start': chunk['start'],
                'end': chunk['end'],
                'summary': summary,
                'speakers': new_speakers,
                'context': new_context
            }
            print('## ', new_speakers)
            print('>> ', new_context)
            print(summary)
            print()

            f.write(json.dumps(section) + '\n')
            f.flush()

        # Carry the model's context and speaker map forward into the next chunk.
        context = new_context
        speakers = new_speakers

        idx = idx + 1

if __name__ == "__main__":
    import fire
    fire.Fire(main)
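
Run notes, hedged: OpenAI() picks up OPENAI_API_KEY from the environment, and the script needs the {prefix}.chunk.json written by chunker.py plus a {prefix}.info.json containing title and description fields (e.g. as written by yt-dlp's --write-info-json; that provenance is an assumption):

./roller-chatgpt-v2.py lex   # reads lex.chunk.json + lex.info.json, appends one JSON section per chunk to lex.summary.json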