Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions app/models/oral_history_content/paragraph.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,19 @@ def initialize(text:, paragraph_index:, speaker_name:)
# Number of words in this paragraph's text, as counted by
# OralHistoryContent::OhmsXml::LegacyTranscript.word_count.
#
# The result is cached in @word_count after the first call, so the text is
# only counted once per paragraph instance.
def word_count
  return @word_count if @word_count

  @word_count = OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
end

# Paragraph text that starts with a `SPEAKER:` label whenever one can be supplied.
#
# Transcript paragraphs often omit the label (usually because the speaker is the
# same as in the previous paragraph). For an LLM it is helpful if every paragraph
# states its speaker explicitly, so nothing has to be assumed from context.
#
# The text is returned unchanged when:
#   * it already matches OHMS_SPEAKER_LABEL_RE,
#   * neither speaker_name nor assumed_speaker_name is present after stripping, or
#   * it starts with "[", which is usually used for labels like [END OF TAPE].
def text_with_forced_speaker_label
  # Already labelled -- nothing to add.
  return text if text =~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE

  # Prefer the explicit speaker, fall back to the assumed one.
  label = speaker_name&.strip.presence || assumed_speaker_name&.strip.presence
  return text unless label

  # Bracketed annotations are not speech, so they get no speaker.
  return text if text.start_with?("[")

  "#{label}: #{text}"
end
end
end
38 changes: 1 addition & 37 deletions app/services/oral_history/claude_interactor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def render_user_prompt(chunks)
ApplicationController.render( template: "claude_interactor/initial_user_prompt",
locals: {
question: question,
formatted_chunks: format_chunks(chunks)
chunks: chunks
},
formats: [:text]
)
Expand All @@ -118,42 +118,6 @@ def get_chunks
chunks
end

# Renders the given chunks as plain-text sections for inclusion in a prompt.
#
# Each chunk becomes a block that opens with a separator line and lists the
# oral history title (with a year or year range appended when dates exist),
# chunk id, speakers, paragraph numbers and the chunk text. A final separator
# line closes the output.
#
# @param chunks enumerable of chunk records responding to #id, #speakers,
#   #start_paragraph_number, #end_paragraph_number, #text and
#   #oral_history_content (whose #work has #title and #date_of_work)
# @return [String]
def format_chunks(chunks)
  separator = "------------------------------"

  sections = chunks.map do |chunk|
    work = chunk.oral_history_content.work

    # hackily get a date range: gather every non-blank start/finish value,
    # de-duplicate and sort, then use the first four characters of the
    # earliest and latest entries.
    dates = work.date_of_work.map { |d| [d.start, d.finish] }.flatten
    dates = dates.map(&:presence).compact.uniq.sort

    date_string =
      case dates.length
      when 0 then ""
      when 1 then ", #{dates.first.slice(0, 4)}"
      else ", #{dates.first.slice(0, 4)}-#{dates.last.slice(0, 4)}"
      end

    # Title is really just for debugging -- it can always be fetched by
    # chunk_id -- but keeping it in the pipeline makes debugging (and
    # footnoting) a lot easier.
    title = work.title + date_string

    paragraph_numbers = chunk.start_paragraph_number.upto(chunk.end_paragraph_number).to_a

    <<~EOS
      #{separator}
      ORAL HISTORY TITLE: #{title}
      CHUNK ID: #{chunk.id}
      SPEAKERS: #{chunk.speakers.join(", ")}
      PARAGRAPH NUMBERS: #{paragraph_numbers.join(", ")}
      TEXT:
      #{chunk.text.chomp}
    EOS
  end

  sections.join + separator
end

# The thing we asked Claude for, does it look like we asked?
#
# Raises ClaudeInteractor::OutputFormattingError if not
Expand Down
2 changes: 1 addition & 1 deletion app/services/oral_history/transcript_chunker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def build_chunk_record(list_of_paragraphs)
speakers.uniq!

OralHistoryChunk.new(
text: list_of_paragraphs.collect(&:text).join("\n\n"),
text: list_of_paragraphs.collect(&:text_with_forced_speaker_label).join("\n\n"),
oral_history_content: oral_history_content,
start_paragraph_number: list_of_paragraphs.first.paragraph_index,
end_paragraph_number: list_of_paragraphs.last.paragraph_index,
Expand Down
15 changes: 14 additions & 1 deletion app/views/claude_interactor/initial_user_prompt.text.erb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,20 @@ USER QUESTION:
<%= question %>

RETRIEVED CONTEXT CHUNKS:
<%= formatted_chunks %>
<% chunks.each do |chunk| -%>
------------------------------
ORAL HISTORY TITLE: <%= chunk.oral_history_content.work.title %>
ORAL HISTORY ID: OH<%= chunk.oral_history_content.work.oral_history_number %>
CHUNK ID: <%= chunk.id %>
SPEAKERS: <%= chunk.speakers.join(", ") %>
PARAGRAPH NUMBERS: <%= chunk.start_paragraph_number.upto(chunk.end_paragraph_number).to_a.join(", ") %>
TEXT:
<% chunk.text.chomp.split("\n\n").each_with_index do |paragraph, index| %>
[OH<%= chunk.oral_history_content.work.oral_history_number %>|P<%= chunk.start_paragraph_number + index %>] <%= paragraph %>

<% end %>
<%- end -%>
------------------------------

TASK:

Expand Down
41 changes: 30 additions & 11 deletions spec/services/oral_history/claude_interactor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,47 @@
include AwsBedrockClaudeMockResponse

let(:work) { create(:oral_history_work) }
let(:chunk1) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH"])}
let(:chunk2) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH", "JONES"], text: "Chunk 2")}

let(:chunk1) { create(:oral_history_chunk,
oral_history_content: work.oral_history_content,
speakers: ["SMITH"],
start_paragraph_number: 12,
end_paragraph_number: 12,
text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?")}

let(:chunk2) { create(:oral_history_chunk,
oral_history_content: work.oral_history_content,
speakers: ["SMITH", "JONES"],
start_paragraph_number: 12,
end_paragraph_number: 13,
text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?\n\nJONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class.")}

let(:interaction) { described_class.new(question: "What are scientists like?", question_embedding: OralHistoryChunk::FAKE_EMBEDDING) }

describe "#format_chunks" do
it "formats" do
expect(interaction.format_chunks([chunk1, chunk2]).strip).to eq <<~EOS.strip
describe "render_user_promopt" do
it "includes formatted chunks" do
expect(interaction.render_user_prompt([chunk1, chunk2])).to include <<~EOS.strip
RETRIEVED CONTEXT CHUNKS:
------------------------------
ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986
ORAL HISTORY TITLE: Oral history interview with William John Bailey
ORAL HISTORY ID: OH#{chunk1.oral_history_content.work.oral_history_number}
CHUNK ID: #{chunk1.id}
SPEAKERS: SMITH
PARAGRAPH NUMBERS: 12, 13, 14, 15
PARAGRAPH NUMBERS: 12
TEXT:
#{chunk1.text.chomp}
[OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon?

------------------------------
ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986
ORAL HISTORY TITLE: Oral history interview with William John Bailey
ORAL HISTORY ID: OH#{chunk2.oral_history_content.work.oral_history_number}
CHUNK ID: #{chunk2.id}
SPEAKERS: SMITH, JONES
PARAGRAPH NUMBERS: 12, 13, 14, 15
PARAGRAPH NUMBERS: 12, 13
TEXT:
#{chunk2.text.chomp}
[OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon?

[OH0012|P13] JONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class.

------------------------------
EOS
end
Expand Down
17 changes: 17 additions & 0 deletions spec/services/oral_history/transcript_chunker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,23 @@ def word_count(*strings)
expect(timestamp_data["previous"]).to eq paragraph.previous_timestamp
end
end

# Chunk text is built from each paragraph's text_with_forced_speaker_label, so
# paragraphs that have no speaker label of their own should still end up
# labelled inside the chunk record.
describe "paragraphs without speaker labels" do
  let(:speaker_label_regexp) { /\A[A-Z]+\:/ }

  # Four consecutive paragraphs from the fixture transcript, the first of which
  # is expected to carry no speaker label of its own.
  let(:list_of_paragraphs) do
    legacy_transcript.paragraphs.slice(326, 4).tap do |list|
      # Sanity-check the fixture against the paragraph's *text*. Matching the
      # paragraph object itself against a Regexp can never succeed, which would
      # make this `not_to match` guard pass no matter what the fixture contains.
      expect(list.first.text).not_to match speaker_label_regexp
    end
  end

  it "get their assumed speaker labels included" do
    record = chunker.build_chunk_record(list_of_paragraphs)

    # The chunk as a whole, and every paragraph within it, starts with a label.
    expect(record.text).to match speaker_label_regexp
    expect(record.text.split("\n\n")).to all match speaker_label_regexp
  end
end
end

describe "#create_db_records" do
Expand Down