diff --git a/app/models/oral_history_content/paragraph.rb b/app/models/oral_history_content/paragraph.rb index d81e2474d..e06b346fd 100644 --- a/app/models/oral_history_content/paragraph.rb +++ b/app/models/oral_history_content/paragraph.rb @@ -38,5 +38,19 @@ def initialize(text:, paragraph_index:, speaker_name:) def word_count @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text) end + + # If a paragraph does not begin with a `SPEAKER:` label (usually cause same as last + # one), add it -- for LLM, helpful if every paragraph begins, no assumptions. + def text_with_forced_speaker_label + # if text doesn't already start with speaker, and we HAVE a speaker to add, + # AND the text doesn't start with "[" which is usually used for labels like [END OF TAPE], + # then add a speaker. + if text !~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE && + (speaker_name&.strip.presence || assumed_speaker_name&.strip.presence) && ! text.start_with?("[") + "#{speaker_name&.strip.presence || assumed_speaker_name&.strip}: #{text}" + else + text + end + end end end diff --git a/app/services/oral_history/claude_interactor.rb b/app/services/oral_history/claude_interactor.rb index c532220ff..413823dbe 100644 --- a/app/services/oral_history/claude_interactor.rb +++ b/app/services/oral_history/claude_interactor.rb @@ -98,7 +98,7 @@ def render_user_prompt(chunks) ApplicationController.render( template: "claude_interactor/initial_user_prompt", locals: { question: question, - formatted_chunks: format_chunks(chunks) + chunks: chunks }, formats: [:text] ) @@ -118,42 +118,6 @@ def get_chunks chunks end - def format_chunks(chunks) - separator = "------------------------------" - - chunks.collect do |chunk| - # Title is really just for debugging, it can always be fetched by chunk_id, but - # it does make debugging a lot easier to keep the title in the pipeline, to - # footnote. 
- - title = chunk.oral_history_content.work.title - - # hackily get a date range - dates = chunk.oral_history_content.work.date_of_work.collect { |d| [d.start, d.finish]}. - flatten.collect(&:presence).compact.uniq.sort - - date_string = if dates.length > 1 - ", #{dates.first.slice(0, 4)}-#{dates.last.slice(0, 4)}" - elsif dates.length > 0 - ", #{dates.first.slice(0, 4)}" - else - "" - end - - title = title += date_string - - <<~EOS - #{separator} - ORAL HISTORY TITLE: #{title} - CHUNK ID: #{chunk.id} - SPEAKERS: #{chunk.speakers.join(", ")} - PARAGRAPH NUMBERS: #{chunk.start_paragraph_number.upto(chunk.end_paragraph_number).to_a.join(", ")} - TEXT: - #{chunk.text.chomp} - EOS - end.join + "#{separator}" - end - # The thing we asked Claude for, does it look like we asked? # # Raises ClaudeInteractor::OutputFormattingError if not diff --git a/app/services/oral_history/transcript_chunker.rb b/app/services/oral_history/transcript_chunker.rb index baa2ebd69..fb8a81482 100644 --- a/app/services/oral_history/transcript_chunker.rb +++ b/app/services/oral_history/transcript_chunker.rb @@ -229,7 +229,7 @@ def build_chunk_record(list_of_paragraphs) speakers.uniq! 
OralHistoryChunk.new( - text: list_of_paragraphs.collect(&:text).join("\n\n"), + text: list_of_paragraphs.collect(&:text_with_forced_speaker_label).join("\n\n"), oral_history_content: oral_history_content, start_paragraph_number: list_of_paragraphs.first.paragraph_index, end_paragraph_number: list_of_paragraphs.last.paragraph_index, diff --git a/app/views/claude_interactor/initial_user_prompt.text.erb b/app/views/claude_interactor/initial_user_prompt.text.erb index 67e6f4303..fc4b5ab32 100644 --- a/app/views/claude_interactor/initial_user_prompt.text.erb +++ b/app/views/claude_interactor/initial_user_prompt.text.erb @@ -2,7 +2,20 @@ USER QUESTION: <%= question %> RETRIEVED CONTEXT CHUNKS: -<%= formatted_chunks %> +<% chunks.each do |chunk| -%> +------------------------------ +ORAL HISTORY TITLE: <%= chunk.oral_history_content.work.title %> +ORAL HISTORY ID: OH<%= chunk.oral_history_content.work.oral_history_number %> +CHUNK ID: <%= chunk.id %> +SPEAKERS: <%= chunk.speakers.join(", ") %> +PARAGRAPH NUMBERS: <%= chunk.start_paragraph_number.upto(chunk.end_paragraph_number).to_a.join(", ") %> +TEXT: +<% chunk.text.chomp.split("\n\n").each_with_index do |paragraph, index| %> +[OH<%= chunk.oral_history_content.work.oral_history_number %>|P<%= chunk.start_paragraph_number + index %>] <%= paragraph %> + +<% end %> +<%- end -%> +------------------------------ TASK: diff --git a/spec/services/oral_history/claude_interactor_spec.rb b/spec/services/oral_history/claude_interactor_spec.rb index 3156538de..297a71bb8 100644 --- a/spec/services/oral_history/claude_interactor_spec.rb +++ b/spec/services/oral_history/claude_interactor_spec.rb @@ -4,28 +4,47 @@ include AwsBedrockClaudeMockResponse let(:work) { create(:oral_history_work) } - let(:chunk1) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH"])} - let(:chunk2) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH", "JONES"], text: 
"Chunk 2")} + + let(:chunk1) { create(:oral_history_chunk, + oral_history_content: work.oral_history_content, + speakers: ["SMITH"], + start_paragraph_number: 12, + end_paragraph_number: 12, + text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?")} + + let(:chunk2) { create(:oral_history_chunk, + oral_history_content: work.oral_history_content, + speakers: ["SMITH", "JONES"], + start_paragraph_number: 12, + end_paragraph_number: 13, + text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?\n\nJONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class.")} let(:interaction) { described_class.new(question: "What are scientists like?", question_embedding: OralHistoryChunk::FAKE_EMBEDDING) } - describe "#format_chunks" do - it "formats" do - expect(interaction.format_chunks([chunk1, chunk2]).strip).to eq <<~EOS.strip + describe "render_user_promopt" do + it "includes formatted chunks" do + expect(interaction.render_user_prompt([chunk1, chunk2])).to include <<~EOS.strip + RETRIEVED CONTEXT CHUNKS: ------------------------------ - ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986 + ORAL HISTORY TITLE: Oral history interview with William John Bailey + ORAL HISTORY ID: OH#{chunk1.oral_history_content.work.oral_history_number} CHUNK ID: #{chunk1.id} SPEAKERS: SMITH - PARAGRAPH NUMBERS: 12, 13, 14, 15 + PARAGRAPH NUMBERS: 12 TEXT: - #{chunk1.text.chomp} + [OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon? 
+ ------------------------------ - ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986 + ORAL HISTORY TITLE: Oral history interview with William John Bailey + ORAL HISTORY ID: OH#{chunk2.oral_history_content.work.oral_history_number} CHUNK ID: #{chunk2.id} SPEAKERS: SMITH, JONES - PARAGRAPH NUMBERS: 12, 13, 14, 15 + PARAGRAPH NUMBERS: 12, 13 TEXT: - #{chunk2.text.chomp} + [OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon? + + [OH0012|P13] JONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class. + ------------------------------ EOS end diff --git a/spec/services/oral_history/transcript_chunker_spec.rb b/spec/services/oral_history/transcript_chunker_spec.rb index 7d81cd7e7..e1027d55c 100644 --- a/spec/services/oral_history/transcript_chunker_spec.rb +++ b/spec/services/oral_history/transcript_chunker_spec.rb @@ -101,6 +101,23 @@ def word_count(*strings) expect(timestamp_data["previous"]).to eq paragraph.previous_timestamp end end + + describe "paragraphs without speaker labels" do + let(:speaker_label_regexp) { /\A[A-Z]+\:/ } + + let(:list_of_paragraphs) do + legacy_transcript.paragraphs.slice(326, 4).tap do |list| + expect(list.first.text).not_to match speaker_label_regexp + end + end + + it "get their assumed speaker labels included" do + record = chunker.build_chunk_record(list_of_paragraphs) + + expect(record.text).to match speaker_label_regexp + expect(record.text.split("\n\n")).to all match speaker_label_regexp + end + end end describe "#create_db_records" do