sciencehistory · jrochkind · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025
diff --git a/app/jobs/oh_transcript_chunker_job.rb b/app/jobs/oh_transcript_chunker_job.rb
@@ -13,12 +13,7 @@ def perform(oral_history_content, delete_existing: false)
       end
     end
 
-    # check to make sure we are OHMS legacy, that's all we can do right now.
-    unless oral_history_content.ohms_xml.present? && oral_history_content.ohms_xml.legacy_transcript.present?
-      raise RuntimeError.new("We only know how to process legacy OHMS xml at present, can't process OralHistoryContent #{oral_history_content.id}")
-    end
-
-    OralHistory::OhmsLegacyTranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records
+    OralHistory::TranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records
   end
 
 end
diff --git a/app/models/oral_history_content/ohms_xml/legacy_transcript.rb b/app/models/oral_history_content/ohms_xml/legacy_transcript.rb
@@ -211,28 +211,14 @@ def self.word_count(*strings)
         strings.collect { |s| s.scan(/\w+/).count }.sum
       end
 
-      # holds an ordered list of Line's, and can describe
-      class Paragraph
+      # subclass for describing our paragraphs, with extra behavior that is line-based,
+      # as Legacy OHMS format is line-based.
+      class Paragraph < ::OralHistoryContent::Paragraph
         # @return [Array<OralHistoryContent::LegacyTranscript::Line>] ordered list of Line objects
         attr_reader :lines
 
         attr_reader :transcript_id
 
-        # @return [integer] 1-based index of paragraph in document
-        attr_reader :paragraph_index
-
-        # @return [Array<Integer>] list of timestamps (as seconds) included in ths paragraph
-        attr_accessor :included_timestamps
-
-        # @return [Integer] timestamp in seconds of the PREVIOUS timestamp to this paragraph,
-        #                   to latest the timestamp sure not to miss beginning of paragraph.
-        attr_accessor :previous_timestamp
-
-        # @return [String] when the paragraph has no speaker name internally, we guess/assume
-        #    it has the same speaker as previous paragraph. Store such an assumed speaker name
-        #    from previous paragraph here.
-        attr_accessor :assumed_speaker_name
-
         def initialize(lines = nil, paragraph_index:)
           @lines = lines || []
           @paragraph_index = paragraph_index
@@ -250,28 +236,24 @@ def text
           @lines.collect {|s| s.text.chomp }.join(" ").strip
         end
 
-        def word_count
-          @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
-        end
-
         # @return [Range] from first to last line number, with line numbers being 1-indexed
         #                 in entire document.
         def line_number_range
           (@[email protected]_num)
         end
 
-        # @return [String] to be used as an anchor within an HTML doc, that can be targeted
-        #                  with a link
-        def fragment_id
-          "oh-t#{transcript_id}-p#{paragraph_index}"
-        end
-
         # @return [String] speaker name from any speaker label. Can be nil. Assumes
         #                  whole paragraph is one speaker, identified on first line, which
         #                  SHOULD be true, but weird things may happen if it ain't.
         def speaker_name
           lines.first&.speaker_label&.chomp(":")
         end
+
+        # @return [String] to be used as an `id` attribute within an HTML doc, identifying a particular
+        #         paragraph.
+        def fragment_id
+          "oh-t#{transcript_id}-p#{paragraph_index}"
+        end
       end
 
       class Line

diff --git a/app/models/oral_history_content/paragraph.rb b/app/models/oral_history_content/paragraph.rb
@@ -0,0 +1,42 @@
+class OralHistoryContent
+
+  # A model for an Oral History paragraph, used by the Chunker to make chunks.
+  #
+  # Different classes can create these, depending on what format of transcript they are creating
+  # from (OHMS, plain text, etc)  in some cases there may be subsets.
+  #
+  # Some may create sub-classes specific to their format, but this is a general API for chunkers.
+  class Paragraph
+    attr_reader :transcript_id
+
+    # @return [integer] 1-based index of paragraph in document
+    attr_reader :paragraph_index
+
+    attr_reader :text
+
+    # @return [Array<Integer>] list of timestamps (as seconds) included in ths paragraph
+    attr_accessor :included_timestamps
+
+    # @return [Integer] timestamp in seconds of the PREVIOUS timestamp to this paragraph,
+    #                   to latest the timestamp sure not to miss beginning of paragraph.
+    attr_accessor :previous_timestamp
+
+    # @return [String] when the paragraph has no speaker name internally, we guess/assume
+    #    it has the same speaker as previous paragraph. Store such an assumed speaker name
+    #    from previous paragraph here.
+    attr_accessor :assumed_speaker_name
+
+    # OHMS transcript sub-classes get these from OHMS transcript model classes
+    attr_accessor :speaker_name, :text
+
+    def initialize(text:, paragraph_index:, speaker_name:)
+      @text = text
+      @paragraph_index = paragraph_index
+      @speaker_name = speaker_name
+    end
+
+    def word_count
+      @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
+    end
+  end
+end
diff --git a/app/services/oral_history/plain_text_paragraph_splitter.rb b/app/services/oral_history/plain_text_paragraph_splitter.rb
@@ -0,0 +1,88 @@
+module OralHistory
+  # Takes a plain text OH transcript, such as included for searching in the OralHistoryContent#searchable_transcript_source
+  # field, and splits it into OralHistoryContent::Paragraph objects, for use by chunker.
+  #
+  # Note that this does not let us know what PDF page the paragraph is on, which might be nice
+  # for citing. And we have no timestamps, no sync timestamps in these transcripts.
+  #
+  # This may be a temporary interim implementation, to demo AI vector search of this content,
+  # while we figure out how we want to do better citing/linking.
+  #
+  class PlainTextParagraphSplitter
+    attr_reader :plain_text
+
+    def initialize(plain_text:)
+      @plain_text = plain_text
+    end
+
+    # @return OralHistoryContent::Paragraph
+    def paragraphs
+      @paragraphs ||= split_paragraphs
+    end
+
+    private
+
+    def split_paragraphs
+      last_speaker_name = nil
+      current_speaker_name = nil
+      paragraph_index = 0
+
+      things = trim_transcript(plain_text).split(/(?:\r?\n\s*){2,}/).collect do |raw_paragraph|
+        raw_paragraph.strip!
+
+        # There is some metadata that comes not only at beginning but sometimes in the middle
+        # after new tape/interview session. We don't want it.
+        next if looks_like_metadata_line?(raw_paragraph)
+
+        current_speaker_name = nil
+        # While this is not an OHMS transcript, the regex extracted from OHMS works well
+        if raw_paragraph =~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE
+          current_speaker_name = $1.chomp(":")
+        end
+
+        paragraph = OralHistoryContent::Paragraph.new(speaker_name: current_speaker_name,
+                                                      paragraph_index: paragraph_index,
+                                                      text: raw_paragraph)
+        if paragraph.speaker_name.blank?
+          paragraph.assumed_speaker_name = last_speaker_name
+        end
+
+
+        last_speaker_name = current_speaker_name
+        paragraph_index +=1
+
+        paragraph
+      end.compact
+    end
+
+    def looks_like_metadata_line?(str)
+      # if it's one line, with one of our known metadata labels, colon, some info
+      str =~ /\A\s*(INTERVIEWEE|INTERVIEWER|DATE|LOCATION):.+$/ ||
+        # Also for now just avoid the [END OF ...] markers.
+        str =~ /\A\[END OF INTERVIEW.*\]\s*$/ ||
+        str =~ /\A\[END OF TAPE.*\]\s*$/
+    end
+
+    # Trim END after last [END OF INTERVEW] marker -- get rid of footnote and index.
+    def trim_transcript(plain_text)
+      plain_text = plain_text.dup
+
+      # we sometimes have unicode BOM and nonsense in there
+      plain_text.gsub!(/[\u200B\uFEFF]/, '')
+
+      # Interview often  strip the LAST one in the transcript and anythi8ng after it
+      # , we'll use negative lookahead to be "last one, not another one after it"
+      if plain_text =~ /\[END OF INTERVIEW( \d+)?\]/
+        plain_text.gsub!(/\[END OF INTERVIEW( \d+)?\](?!.*\[END OF INTERVIEW).*/m, '')
+      elsif plain_text =~ /NOTES|INDEX/
+        # But sometimes they don't, but still have a NOTES and/OR INDEX? On a line by itself,
+        # eliminate with everything afterwords.
+        plain_text.gsub!(/^NOTES|INDEX$.*/m, '')
+      end
+
+      plain_text.strip!
+
+      plain_text
+    end
+  end
+end
diff --git a/...history/ohms_legacy_transcript_chunker.rb → ...rvices/oral_history/transcript_chunker.rb b/...history/ohms_legacy_transcript_chunker.rb → ...rvices/oral_history/transcript_chunker.rb
@@ -6,7 +6,7 @@ module OralHistory
   # a few transcripts have multiple interviewers or intervieweees!), but that is just done
   # approximately while also staying within a minimum and max word count for a chunk more strictly.
   #
-  class OhmsLegacyTranscriptChunker
+  class TranscriptChunker
     # always want more than this many words
     LOWER_WORD_LIMIT = 260
 
@@ -23,7 +23,10 @@ class OhmsLegacyTranscriptChunker
 
     EMBEDDING_RETRY_WAIT = 5
 
-    attr_reader :transcript,  :interviewee_names, :oral_history_content
+    attr_reader :interviewee_names, :oral_history_content
+
+    # @return [Array<OralHistoryContent::Paragraph>] paragraphs extracted from the transcript source
+    attr_reader :paragraphs
 
 
     # @param allow_embedding_wait_seconds [Integer] if we exceed open ai rate limit for getting
@@ -34,12 +37,24 @@ def initialize(oral_history_content:, allow_embedding_wait_seconds: 0)
         raise ArgumentError.new("argument must be OralHistoryContent, but was #{oral_history_content.class.name}")
       end
 
-      unless oral_history_content.ohms_xml.legacy_transcript.present?
-        raise ArgumentError.new("#{self.class.name} can only be used with a LegacyTranscript, but argument does not have one: #{oral_history_content.inspect}")
-      end
-
       @oral_history_content = oral_history_content
-      @transcript = oral_history_content.ohms_xml.legacy_transcript
+
+      # different ways of extracting paragraphs, they all should return array of OralHistoryContent::Paragraph
+      @paragraphs = if oral_history_content.ohms_xml&.legacy_transcript.present?
+        oral_history_content.ohms_xml.legacy_transcript.paragraphs
+
+      elsif oral_history_content.ohms_xml
+        # TODO, new style transcript
+        raise ArgumentError.new("#{self.class.name} can only be used with OHMS transcripts if they are legacy: #{oral_history_content.inspect}")
+
+      elsif oral_history_content.searchable_transcript_source.present?
+        OralHistory::PlainTextParagraphSplitter.new(
+          plain_text: oral_history_content.searchable_transcript_source
+        ).paragraphs
+
+      else
+        raise ArgumentError.new("#{self.class.name} can't find paragraph source content for: #{oral_history_content.inspect}")
+      end
 
       # For matching to speaker names, assume it's "lastname, first dates" type heading,
       # take last name and upcase
@@ -133,7 +148,7 @@ def split_chunks
       current_chunk = []
       paragraph_speaker_name = nil
 
-      transcript.paragraphs.each do |paragraph|
+      paragraphs.each do |paragraph|
         last_paragraph_speaker_name = paragraph_speaker_name
 
         # only change speaker name if we have one, otherwise leave last one
@@ -158,8 +173,8 @@ def split_chunks
         # one with the presumed question. end_with? is used for some weird "multi-interviewee with
         # same name" use cases, good enough.
         elsif prospective_count >= WORD_GOAL &&
-              !interviewee_names.find { |n| paragraph_speaker_name.end_with? n }  &&
-              interviewee_names.find { |n| last_paragraph_speaker_name.end_with? n }
+              !interviewee_names.find { |n| paragraph_speaker_name&.end_with? n }  &&
+              interviewee_names.find { |n| last_paragraph_speaker_name&.end_with? n }
           chunks << current_chunk
 
           overlap_paragraphs = (chunks.last || []).last(1)