Commit

cleanup main folder

rmusser01 committed May 1, 2024
1 parent 7249b9a commit 9ae51b0
Showing 9 changed files with 698 additions and 0 deletions.
61 changes: 61 additions & 0 deletions tldw-scripts/chunker.py
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
import string
import json

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer', use_fast=True)

def segment_merger(filename, max_text_len=1000):
    # Merge consecutive segments from the same speaker, breaking early once the
    # running text exceeds max_text_len and ends on sentence-final punctuation.
    segments = json.load(open(filename))

    text = ''
    last_segment = {'speaker': None}
    start_time = None
    stop_chars = string.punctuation.replace(',', '')  # punctuation that may end a merged segment

    for segment in segments:
        early_break = (max_text_len > 0) and (len(text) > max_text_len) and (text[-1] in stop_chars)
        if last_segment['speaker'] != segment['speaker'] or early_break:
            if text != '':
                yield {'speaker': last_segment['speaker'], 'text': text, 'start': start_time, 'end': last_segment['end']}
            text = segment['text'].lstrip()
            start_time = segment['start']
        else:
            text += segment['text']
        last_segment = segment

    if text != '':
        yield {'speaker': last_segment['speaker'], 'text': text, 'start': start_time, 'end': last_segment['end']}

def time_splitter(merged_segments, chunk_size=300):
    # Group speaker-merged segments into chunks of roughly chunk_size seconds.
    start_time = None
    text = ''
    speakers = []

    for segment in merged_segments:
        if start_time is None:
            start_time = segment['start']
        if segment['speaker'] not in speakers:
            speakers.append(segment['speaker'])
        text += f"{segment['speaker']}: {segment['text']}\n"
        if segment['end'] - start_time >= chunk_size:
            yield {'text': text, 'start': start_time, 'end': segment['end'], 'speakers': speakers}
            start_time = None
            text = ''
            speakers = []

    # Flush any trailing text shorter than chunk_size so the final chunk is not dropped.
    if text != '':
        yield {'text': text, 'start': start_time, 'end': segment['end'], 'speakers': speakers}

def main(prefix: str, chunk_size: int = 300, max_text_len: int = 800):
    merged_segments = list(segment_merger(prefix + '.diarize.json', max_text_len))
    split_segments = list(time_splitter(merged_segments, chunk_size))
    max_tokens = 0
    with open(prefix + '.chunk.json', 'w') as f:
        json.dump(split_segments, f)
    for idx, segment in enumerate(split_segments):
        tokens = tokenizer.encode(segment['text'])  # token ids, used only to size the chunk
        if len(tokens) > max_tokens:
            max_tokens = len(tokens)
        print(f"Segment {idx}: {len(tokens)} tokens, {len(segment['text'])} characters, {int(segment['end'] - segment['start'])} seconds")

    print(f"Largest chunk was {max_tokens} tokens")
    print(f"Wrote {len(split_segments)} chunks to {prefix}.chunk.json")

if __name__ == "__main__":
    import fire
    fire.Fire(main)
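
Usage note: fire.Fire(main) exposes main as a CLI, so the script can be driven directly. A minimal sketch of the input it expects, assuming the upstream diarization/transcription step emits a JSON list of segments carrying speaker, text, start, and end keys (the "lex" prefix and sample values are hypothetical):

# sketch: fabricate a tiny lex.diarize.json, then chunk it
import json
segments = [
    {'speaker': 'SPEAKER 1', 'text': ' Hello and welcome.', 'start': 0.0, 'end': 2.5},
    {'speaker': 'SPEAKER 2', 'text': ' Thanks for having me.', 'start': 2.5, 'end': 4.0},
]
json.dump(segments, open('lex.diarize.json', 'w'))
# then, from a shell:
#   ./chunker.py lex --chunk_size 300 --max_text_len 800
# which writes lex.chunk.json, a list of {text, start, end, speakers} chunks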
57 changes: 57 additions & 0 deletions tldw-scripts/compare-app.py
@@ -0,0 +1,57 @@
import json
import glob

import streamlit as st

def load_analysis_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def display_analysis_data(data):
    tests = data['tests']
    models_list = data['models']
    models = {}
    for model_info in models_list:
        models[model_info['id']] = model_info

    # Summary row: one column per model.
    summary_cols = st.columns(len(models_list))
    for model_id, model_info in models.items():
        with summary_cols[model_info['idx']]:
            st.subheader(f"{model_info['short_name']}")

    for test_name, test_data in tests.items():
        st.markdown(f"#### {test_name}")

        columns = st.columns(len(models))
        if 'summary' in test_data:
            st.markdown("**Analysis**: " + test_data['summary'])

        for model_id, model_result in test_data['results'].items():
            model_info = models[model_id]

            # Color passing tests blue and failing tests red.
            model_result['passing_tests'] = '\n\n'.join([f":blue[{x}]" for x in model_result['passing_tests'].split('\n') if x.strip() != ''])
            model_result['failing_tests'] = '\n\n'.join([f":red[{x}]" for x in model_result['failing_tests'].split('\n') if x.strip() != ''])

            with columns[model_info['idx']]:
                #st.subheader(f"{model_info['short_name']}")
                st.write(model_result['answer'])

st.set_page_config(page_title='Analysis Explorer', layout="wide")
st.markdown("""
<style>
    .block-container {
        padding-top: 2rem;
        padding-bottom: 0rem;
        padding-left: 3rem;
        padding-right: 3.5rem;
    }
</style>
""", unsafe_allow_html=True)

files = sorted(glob.glob('compare/*.json'))
data = [json.load(open(file, 'r')) for file in files]
titles = [x['config']['title'] for x in data]
selected = st.selectbox('Select Summary', titles)
idx = titles.index(selected)
display_analysis_data(data[idx])
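
Launch with: streamlit run compare-app.py. The app reads every compare/*.json written by compare.py; a minimal sketch of the shape each file must have, assembled from the fields the code above reads (all values hypothetical):

# sketch of one compare/*.json file
example = {
    'config': {'title': 'My Comparison (english)'},
    'models': [{'id': 'model-a', 'short_name': 'Model A', 'idx': 0}],
    'tests': {
        'some-test': {
            'summary': 'optional analyser text',
            'results': {'model-a': {'answer': '...', 'passing_tests': '', 'failing_tests': ''}},
        },
    },
}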
69 changes: 69 additions & 0 deletions tldw-scripts/compare.py
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
import json
import os
from copy import copy

import fire
import yaml

def prepare(TEST_LANGUAGE, path, files):
    out = {}
    models = []

    for idx, info in enumerate(files):
        file = os.path.join(path, info['eval'])
        id = info['id']

        # Evaluation filenames encode metadata as underscore-separated tags.
        tags = os.path.basename(file).replace('.ndjson', '').split('_')
        prompt = tags[3]
        params = tags[5]
        model = tags[6]

        models.append({'prompt': prompt, 'short_name': info['short_name'], 'params': params, 'model': model, 'id': id, 'idx': idx, 'passed': 0, 'total': 0})
        results = [json.loads(line) for line in open(file)]

        for r in results:
            if r['language'] != TEST_LANGUAGE:
                continue

            testid = r['name'] + '-' + r['language']
            if testid not in out:
                out[testid] = {'results': {}, 'task': '', 'language': r['language']}

            check_summary = ''
            passing_tests = ''
            failing_tests = ''

            out[testid]['results'][id] = {
                'check_summary': check_summary,
                'passing_tests': passing_tests,
                'failing_tests': failing_tests,
                #'code': r['code'],
                'answer': r['answer']
            }

            #models[idx]['passed'] += r['passed']
            #models[idx]['total'] += r['total']

    return {'tests': out, 'models': models}

def main(config: str, path: str = "./", analyser: str = "", language: str = "english"):
    cfg = yaml.safe_load(open(config))

    for lang in language.split(','):
        cfg['language'] = lang
        print('Comparing results for', lang)
        data = prepare(cfg['language'], path, cfg['models'])
        data['config'] = copy(cfg)
        data['config']['title'] += f" ({lang})"
        data['analyser'] = analyser

        if analyser != "":
            # `analysis` is not defined in this file; it is assumed to be provided elsewhere.
            analysis(data, analyser)

        outfile = config.replace('.yaml', f'-{lang}.json')
        with open(outfile, 'w') as f:
            json.dump(data, f, indent=4)

if __name__ == "__main__":
    fire.Fire(main)
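
main() loads a YAML config whose fields prepare() consumes; a sketch, generated from Python to keep it self-contained. The eval filename is hypothetical but follows the underscore-tag layout the code slices (tags[3] = prompt, tags[5] = params, tags[6] = model):

# sketch: generate a minimal config, then run: ./compare.py compare.yaml
import yaml
cfg = {
    'title': 'My Comparison',  # hypothetical
    'models': [
        {'id': 'model-a', 'short_name': 'Model A',
         # splits to ['interview','junior','v2','promptA','x','greedy','llama']
         'eval': 'interview_junior_v2_promptA_x_greedy_llama.ndjson'},  # hypothetical file
    ],
}
yaml.safe_dump(cfg, open('compare.yaml', 'w'))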
29 changes: 29 additions & 0 deletions tldw-scripts/merger.py
@@ -0,0 +1,29 @@
import json
import sys

from transformers import AutoTokenizer

in_file = sys.argv[1]
with open(in_file) as infile:
    chunks = [json.loads(line) for line in infile.readlines()]

def part_to_time(part):
    # Each part covers five minutes; format the offset as HH:MM.
    mins = part * 5
    oh = mins // 60
    om = mins % 60
    return f'{oh:02}:{om:02}'

text = ''
for idx, chunk in enumerate(chunks):
    #text += f'\n\n[{part_to_time(idx)} - {part_to_time(idx+1)}] '
    text += f'\nSection {idx + 1}: {chunk["answer"]}\n'

out_file = in_file.replace('ndjson', 'txt')
with open(out_file, 'w') as outfile:
    outfile.write(text)

tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer', use_fast=True)
tokens = tokenizer.encode(text)  # token ids, used to report the summary's token count

print('chunks:', len(chunks))
print('summary bytes:', len(text))
print('summary tokens:', len(tokens))
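
Usage sketch (the filename is hypothetical; any NDJSON whose lines are JSON objects with an "answer" field works):

python merger.py lex.summary.ndjson   # writes lex.summary.txt and prints chunk/byte/token counts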
10 changes: 10 additions & 0 deletions tldw-scripts/pyannote.py
@@ -0,0 +1,10 @@
import torch
from pyannote.audio import Pipeline

# Loading this pipeline requires accepting the model's terms on the Hugging Face Hub
# (and typically a Hub access token).
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization").to(torch.device("cuda"))

# Apply the pretrained diarization pipeline.
diarization = pipeline("lex.wav", num_speakers=2)

# Print the result, one speaker turn per line.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
86 changes: 86 additions & 0 deletions tldw-scripts/roller-chatgpt-v2.py
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
import json

from jinja2 import Template
from openai import OpenAI

prompt_template = """
Continue the rolling transcription summary of "{{title}}". Consider the current context when summarizing the given transcription part.

### Context: {{ context }}

Speaker-Map: {{ speakermap }}

### Transcription part {{ idx }} of {{ len }}, start time {{ start }}:

{{ chunk }}

### Instruction: Using the Context above, analyze the Transcription and respond with a JSON object in this form:

{
    "Speaker-Map": { "SPEAKER 1": "Bob Dole", "SPEAKER 2": "Jane Doe" }, // A map of speakers to their names, make sure to remember all previous speakers.
    "Next-Context": "...", // An updated context for the next part of the transcription. Always include the speakers and the current topics of discussion.
    "Summary": "..." // A detailed, point-by-point summary of the current transcription.
}
"""

client = OpenAI()

def main(prefix: str, init_speakers: str = ""):
    the_template = Template(prompt_template)

    split_segments = json.load(open(prefix + '.chunk.json'))
    info = json.load(open(prefix + '.info.json'))

    context = f"""
Video Title: {info['title']}
Video Description: {info['description'][:1024]}
"""

    speakers = "{ UNKNOWN }"

    f = open(prefix + '.summary.json', 'w')
    idx = 0
    for chunk in split_segments:
        dur = chunk['end'] - chunk['start']
        print(f"{idx}: {dur}s {len(chunk['text'])} chars")

        prompt = the_template.render(chunk=chunk['text'], start=chunk['start'], end=chunk['end'],
                                     idx=idx, len=len(split_segments), context=context,
                                     speakermap=speakers, title=info['title'])

        messages = [{'role': 'user', 'content': prompt}]
        response = client.chat.completions.create(messages=messages, model='gpt-3.5-turbo-1106',
                                                  temperature=0.1, max_tokens=1024,
                                                  response_format={"type": "json_object"})

        answer = response.choices[0].message.content
        parsed = json.loads(answer)

        summary = parsed.get('Summary', '')
        new_speakers = parsed.get('Speaker-Map', '')
        new_context = parsed.get('Next-Context', '')

        if summary == '' or new_context == '' or new_speakers == '':
            print('extraction failed:', new_context, new_speakers, summary)
            exit(1)
        else:
            section = {
                'start': chunk['start'],
                'end': chunk['end'],
                'summary': summary,
                'speakers': new_speakers,
                'context': new_context
            }
            print('## ', new_speakers)
            print('>> ', new_context)
            print(summary)
            print()

            f.write(json.dumps(section) + '\n')
            f.flush()

        # Carry the model's context and speaker map forward into the next chunk.
        context = new_context
        speakers = new_speakers

        idx = idx + 1

if __name__ == "__main__":
    import fire
    fire.Fire(main)
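
Run notes, hedged: OpenAI() picks up OPENAI_API_KEY from the environment, and the script needs the {prefix}.chunk.json written by chunker.py plus a {prefix}.info.json containing title and description fields (e.g. as written by yt-dlp's --write-info-json; that provenance is an assumption):

./roller-chatgpt-v2.py lex   # reads lex.chunk.json + lex.info.json, appends one JSON section per chunk to lex.summary.json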