sciencehistory · jrochkind · Jan 12, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/app/models/oral_history_content/paragraph.rb b/app/models/oral_history_content/paragraph.rb
@@ -38,5 +38,19 @@ def initialize(text:, paragraph_index:, speaker_name:)
     def word_count
       @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
     end
+
+    # If a paragraph does not begin with a `SPEAKER:` label (usually cause same as last
+    # one), add it -- for LLM, helpful if every paragraph begins, no assumptions.
+    def text_with_forced_speaker_label
+      # if text doesn't already start with speaker, and we HAVE a speaker to add,
+      # AND the text doesn't start with "[" which is usually used for labels like [END OF TAPE],
+      # then add a speaker.
+      if text !~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE &&
+            (speaker_name&.strip.presence || assumed_speaker_name&.strip.presence) && ! text.start_with?("[")
+        "#{speaker_name&.strip.presence || assumed_speaker_name&.strip}: #{text}"
+      else
+        text
+      end
+    end
   end
 end
diff --git a/app/services/oral_history/claude_interactor.rb b/app/services/oral_history/claude_interactor.rb
@@ -98,7 +98,7 @@ def render_user_prompt(chunks)
       ApplicationController.render( template: "claude_interactor/initial_user_prompt",
                                     locals: {
                                       question: question,
-                                      formatted_chunks: format_chunks(chunks)
+                                      chunks: chunks
                                     },
                                     formats: [:text]
                                   )
@@ -118,42 +118,6 @@ def get_chunks
       chunks
     end
 
-    def format_chunks(chunks)
-      separator = "------------------------------"
-
-      chunks.collect do |chunk|
-        # Title is really just for debugging, it can always be fetched by chunk_id, but
-        # it does make debugging a lot easier to keep the title in the pipeline, to
-        # footnote.
-
-        title = chunk.oral_history_content.work.title
-
-        # hackily get a date range
-        dates = chunk.oral_history_content.work.date_of_work.collect { |d| [d.start, d.finish]}.
-          flatten.collect(&:presence).compact.uniq.sort
-
-        date_string = if dates.length > 1
-          ", #{dates.first.slice(0, 4)}-#{dates.last.slice(0, 4)}"
-        elsif dates.length > 0
-          ", #{dates.first.slice(0, 4)}"
-        else
-          ""
-        end
-
-        title = title += date_string
-
-        <<~EOS
-          #{separator}
-          ORAL HISTORY TITLE: #{title}
-          CHUNK ID: #{chunk.id}
-          SPEAKERS: #{chunk.speakers.join(", ")}
-          PARAGRAPH NUMBERS: #{chunk.start_paragraph_number.upto(chunk.end_paragraph_number).to_a.join(", ")}
-          TEXT:
-          #{chunk.text.chomp}
-        EOS
-      end.join + "#{separator}"
-    end
-
     # The thing we asked Claude for, does it look like we asked?
     #
     # Raises ClaudeInteractor::OutputFormattingError if not

diff --git a/app/services/oral_history/transcript_chunker.rb b/app/services/oral_history/transcript_chunker.rb
@@ -229,7 +229,7 @@ def build_chunk_record(list_of_paragraphs)
       speakers.uniq!
 
       OralHistoryChunk.new(
-        text: list_of_paragraphs.collect(&:text).join("\n\n"),
+        text: list_of_paragraphs.collect(&:text_with_forced_speaker_label).join("\n\n"),
         oral_history_content: oral_history_content,
         start_paragraph_number: list_of_paragraphs.first.paragraph_index,
         end_paragraph_number: list_of_paragraphs.last.paragraph_index,

diff --git a/app/views/claude_interactor/initial_user_prompt.text.erb b/app/views/claude_interactor/initial_user_prompt.text.erb
@@ -2,7 +2,20 @@ USER QUESTION:
 <%= question %>
 
 RETRIEVED CONTEXT CHUNKS:
-<%= formatted_chunks %>
+<% chunks.each do |chunk| -%>
+------------------------------
+ORAL HISTORY TITLE: <%= chunk.oral_history_content.work.title %>
+ORAL HISTORY ID: OH<%= chunk.oral_history_content.work.oral_history_number %>
+CHUNK ID: <%= chunk.id %>
+SPEAKERS: <%= chunk.speakers.join(", ") %>
+PARAGRAPH NUMBERS: <%= (chunk.start_paragraph_number..chunk.end_paragraph_number).to_a.join(", ") %>
+TEXT:
+<% chunk.text.chomp.split("\n\n").each.with_index(chunk.start_paragraph_number) do |paragraph_text, paragraph_number| %>
+[OH<%= chunk.oral_history_content.work.oral_history_number %>|P<%= paragraph_number %>] <%= paragraph_text %>
+
+<% end %>
+<%- end -%>
+------------------------------
 
 TASK:
 

diff --git a/spec/services/oral_history/claude_interactor_spec.rb b/spec/services/oral_history/claude_interactor_spec.rb
@@ -4,28 +4,47 @@
   include AwsBedrockClaudeMockResponse
 
   let(:work) { create(:oral_history_work) }
-  let(:chunk1) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH"])}
-  let(:chunk2) { create(:oral_history_chunk, oral_history_content: work.oral_history_content, speakers: ["SMITH", "JONES"], text: "Chunk 2")}
+
+  let(:chunk1) { create(:oral_history_chunk,
+    oral_history_content: work.oral_history_content,
+    speakers: ["SMITH"],
+    start_paragraph_number: 12,
+    end_paragraph_number: 12,
+    text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?")}
+
+  let(:chunk2) { create(:oral_history_chunk,
+    oral_history_content: work.oral_history_content,
+    speakers: ["SMITH", "JONES"],
+    start_paragraph_number: 12,
+    end_paragraph_number: 13,
+    text: "SMITH: If you think back to your time together at school, what kind of a student was Gordon?\n\nJONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class.")}
 
   let(:interaction) { described_class.new(question: "What are scientists like?", question_embedding: OralHistoryChunk::FAKE_EMBEDDING) }
 
-  describe "#format_chunks" do
-    it "formats" do
-      expect(interaction.format_chunks([chunk1, chunk2]).strip).to eq <<~EOS.strip
+  describe "render_user_prompt" do
+    it "includes formatted chunks" do
+      expect(interaction.render_user_prompt([chunk1, chunk2])).to include <<~EOS.strip
+       RETRIEVED CONTEXT CHUNKS:
        ------------------------------
-       ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986
+       ORAL HISTORY TITLE: Oral history interview with William John Bailey
+       ORAL HISTORY ID: OH#{chunk1.oral_history_content.work.oral_history_number}
        CHUNK ID: #{chunk1.id}
        SPEAKERS: SMITH
-       PARAGRAPH NUMBERS: 12, 13, 14, 15
+       PARAGRAPH NUMBERS: 12
        TEXT:
-       #{chunk1.text.chomp}
+       [OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon?
+
        ------------------------------
-       ORAL HISTORY TITLE: Oral history interview with William John Bailey, 1986
+       ORAL HISTORY TITLE: Oral history interview with William John Bailey
+       ORAL HISTORY ID: OH#{chunk2.oral_history_content.work.oral_history_number}
        CHUNK ID: #{chunk2.id}
        SPEAKERS: SMITH, JONES
-       PARAGRAPH NUMBERS: 12, 13, 14, 15
+       PARAGRAPH NUMBERS: 12, 13
        TEXT:
-       #{chunk2.text.chomp}
+       [OH0012|P12] SMITH: If you think back to your time together at school, what kind of a student was Gordon?
+
+       [OH0012|P13] JONES: He was a good student. He has always been a tremendous student. Even in grammar school. But, he was a year ahead of me. I wasn't in his class.
+
        ------------------------------
       EOS
     end

diff --git a/spec/services/oral_history/transcript_chunker_spec.rb b/spec/services/oral_history/transcript_chunker_spec.rb
@@ -101,6 +101,23 @@ def word_count(*strings)
           expect(timestamp_data["previous"]).to eq paragraph.previous_timestamp
         end
       end
+
+      describe "paragraphs without speaker labels" do
+        let(:speaker_label_regexp) { /\A[A-Z]+\:/ }
+
+        let(:list_of_paragraphs) do
+          legacy_transcript.paragraphs.slice(326, 4).tap do |list|
+            expect(list.first).not_to match speaker_label_regexp
+          end
+        end
+
+        it "get their assumed speaker labels included" do
+          record = chunker.build_chunk_record(list_of_paragraphs)
+
+          expect(record.text).to match speaker_label_regexp
+          expect(record.text.split("\n\n")).to all match speaker_label_regexp
+        end
+      end
     end
 
     describe "#create_db_records" do