Skip to content
Draft
7 changes: 1 addition & 6 deletions app/jobs/oh_transcript_chunker_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,7 @@ def perform(oral_history_content, delete_existing: false)
end
end

# check to make sure we are OHMS legacy, that's all we can do right now.
unless oral_history_content.ohms_xml.present? && oral_history_content.ohms_xml.legacy_transcript.present?
raise RuntimeError.new("We only know how to process legacy OHMS xml at present, can't process OralHistoryContent #{oral_history_content.id}")
end

OralHistory::OhmsLegacyTranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records
OralHistory::TranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records
end

end
36 changes: 9 additions & 27 deletions app/models/oral_history_content/ohms_xml/legacy_transcript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -211,28 +211,14 @@ def self.word_count(*strings)
strings.collect { |s| s.scan(/\w+/).count }.sum
end

# holds an ordered list of Line's, and can describe
class Paragraph
# subclass for describing our paragraphs, with extra behavior that is line-based,
# as Legacy OHMS format is line-based.
class Paragraph < ::OralHistoryContent::Paragraph
# @return [Array<OralHistoryContent::LegacyTranscript::Line>] ordered list of Line objects
attr_reader :lines

attr_reader :transcript_id

# @return [integer] 1-based index of paragraph in document
attr_reader :paragraph_index

# @return [Array<Integer>] list of timestamps (as seconds) included in ths paragraph
attr_accessor :included_timestamps

# @return [Integer] timestamp in seconds of the PREVIOUS timestamp to this paragraph,
# to latest the timestamp sure not to miss beginning of paragraph.
attr_accessor :previous_timestamp

# @return [String] when the paragraph has no speaker name internally, we guess/assume
# it has the same speaker as previous paragraph. Store such an assumed speaker name
# from previous paragraph here.
attr_accessor :assumed_speaker_name

def initialize(lines = nil, paragraph_index:)
@lines = lines || []
@paragraph_index = paragraph_index
Expand All @@ -250,28 +236,24 @@ def text
@lines.collect {|s| s.text.chomp }.join(" ").strip
end

def word_count
@word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
end

# @return [Range] from first to last line number, with line numbers being 1-indexed
# in entire document.
def line_number_range
  # Spans the first line's number through the last line's number (inclusive).
  (@lines.first.line_num..@lines.last.line_num)
end

# @return [String] to be used as an anchor within an HTML doc, that can be targeted
# with a link
def fragment_id
"oh-t#{transcript_id}-p#{paragraph_index}"
end

# @return [String] speaker name from any speaker label. Can be nil. Assumes
# whole paragraph is one speaker, identified on first line, which
# SHOULD be true, but weird things may happen if it ain't.
def speaker_name
  # Only the first line is consulted; a missing line or missing label yields nil.
  opening_line = lines.first
  label = opening_line&.speaker_label
  label&.chomp(":")
end

# @return [String] to be used as an `id` attribute within an HTML doc, identifying a particular
# paragraph.
def fragment_id
  # Built from the transcript id and this paragraph's index within the document.
  transcript_part = transcript_id
  paragraph_part = paragraph_index
  "oh-t#{transcript_part}-p#{paragraph_part}"
end
end

class Line
Expand Down
42 changes: 42 additions & 0 deletions app/models/oral_history_content/paragraph.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
class OralHistoryContent
  # A model for an Oral History paragraph, used by the Chunker to make chunks.
  #
  # Different classes can create these, depending on what format of transcript they are
  # created from (OHMS, plain text, etc).
  #
  # Some may create sub-classes specific to their format, but this is the general API
  # that chunkers rely on.
  class Paragraph
    # Not assigned by this class's initializer.
    # NOTE(review): presumably set by format-specific subclasses -- confirm.
    attr_reader :transcript_id

    # @return [Integer] 1-based index of paragraph in document
    attr_reader :paragraph_index

    # @return [Array<Integer>] list of timestamps (as seconds) included in this paragraph
    attr_accessor :included_timestamps

    # @return [Integer] timestamp in seconds of the timestamp PRECEDING this paragraph,
    #   i.e. the latest timestamp that is sure not to miss the beginning of the paragraph.
    attr_accessor :previous_timestamp

    # @return [String] when the paragraph has no speaker name internally, we guess/assume
    #   it has the same speaker as the previous paragraph. Such an assumed speaker name
    #   from the previous paragraph is stored here.
    attr_accessor :assumed_speaker_name

    # OHMS transcript sub-classes get this from OHMS transcript model classes.
    # @return [String, nil]
    attr_accessor :speaker_name

    # @return [String] text of the paragraph (sub-classes may compute it instead)
    attr_reader :text

    # @param text [String] paragraph text
    # @param paragraph_index [Integer] 1-based index of paragraph in document
    # @param speaker_name [String, nil] speaker label for the paragraph, if known
    def initialize(text:, paragraph_index:, speaker_name:)
      @text = text
      @paragraph_index = paragraph_index
      @speaker_name = speaker_name
    end

    # Replaces the paragraph text, discarding the memoized word count so that
    # #word_count does not keep reporting the count of the old text.
    #
    # @param new_text [String]
    def text=(new_text)
      @word_count = nil
      @text = new_text
    end

    # @return [Integer] number of words in #text, memoized after first call
    def word_count
      @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text)
    end
  end
end
88 changes: 88 additions & 0 deletions app/services/oral_history/plain_text_paragraph_splitter.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
module OralHistory
  # Takes a plain text OH transcript, such as included for searching in the
  # OralHistoryContent#searchable_transcript_source field, and splits it into
  # OralHistoryContent::Paragraph objects, for use by the chunker.
  #
  # Note that this does not let us know what PDF page the paragraph is on, which might be nice
  # for citing. And we have no timestamps, no sync timestamps in these transcripts.
  #
  # This may be a temporary interim implementation, to demo AI vector search of this content,
  # while we figure out how we want to do better citing/linking.
  #
  class PlainTextParagraphSplitter
    attr_reader :plain_text

    # @param plain_text [String, nil] full plain text of the transcript
    def initialize(plain_text:)
      @plain_text = plain_text
    end

    # @return [Array<OralHistoryContent::Paragraph>] paragraphs in document order, memoized
    def paragraphs
      @paragraphs ||= split_paragraphs
    end

    private

    # Splits the trimmed transcript on blank lines and builds one Paragraph per
    # remaining chunk of text, skipping chunks that look like metadata.
    #
    # @return [Array<OralHistoryContent::Paragraph>]
    def split_paragraphs
      last_speaker_name = nil
      paragraph_index = 0

      trim_transcript(plain_text).split(/(?:\r?\n\s*){2,}/).collect do |raw_paragraph|
        raw_paragraph = raw_paragraph.strip

        # There is some metadata that comes not only at beginning but sometimes in the middle
        # after new tape/interview session. We don't want it.
        next if looks_like_metadata_line?(raw_paragraph)

        current_speaker_name = nil
        # While this is not an OHMS transcript, the regex extracted from OHMS works well
        if raw_paragraph =~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE
          current_speaker_name = Regexp.last_match(1).chomp(":")
        end

        # Paragraph#paragraph_index is documented as 1-based, so increment before use.
        # Skipped metadata chunks do not consume an index.
        paragraph_index += 1

        paragraph = OralHistoryContent::Paragraph.new(speaker_name: current_speaker_name,
                                                      paragraph_index: paragraph_index,
                                                      text: raw_paragraph)

        if paragraph.speaker_name.blank?
          paragraph.assumed_speaker_name = last_speaker_name
        else
          # Only remember real labels, so a run of several unlabelled paragraphs
          # all inherit the most recent labelled speaker.
          last_speaker_name = paragraph.speaker_name
        end

        paragraph
      end.compact
    end

    # @param str [String] one already-stripped paragraph of text
    # @return [Boolean] true when the paragraph should be dropped as metadata
    def looks_like_metadata_line?(str)
      [
        # starts with one of our known metadata labels, colon, some info
        /\A\s*(INTERVIEWEE|INTERVIEWER|DATE|LOCATION):.+$/,
        # Also for now just avoid the [END OF ...] markers.
        /\A\[END OF INTERVIEW.*\]\s*$/,
        /\A\[END OF TAPE.*\]\s*$/
      ].any? { |pattern| str.match?(pattern) }
    end

    # Trims the end matter (footnotes, index) that follows the interview itself.
    #
    # @param plain_text [String, nil]
    # @return [String] a new, stripped string; the argument is not modified
    def trim_transcript(plain_text)
      # we sometimes have unicode BOM and zero-width spaces in there
      text = plain_text.to_s.gsub(/[\u200B\uFEFF]/, '')

      if text.match?(/\[END OF INTERVIEW( \d+)?\]/)
        # Strip the LAST [END OF INTERVIEW] marker in the transcript and anything after it;
        # the negative lookahead means "last one, not another one after it".
        text = text.sub(/\[END OF INTERVIEW( \d+)?\](?!.*\[END OF INTERVIEW).*/m, '')
      else
        # Sometimes there is no end marker, but still a NOTES and/or INDEX heading on a
        # line by itself: eliminate the first such heading with everything afterwards.
        # The alternation is grouped so both words are anchored to a whole line, and a
        # mere mention of "NOTES" or "INDEX" inside a sentence is left alone.
        text = text.sub(/^[ \t]*(?:NOTES|INDEX)[ \t]*\r?$.*/m, '')
      end

      text.strip
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module OralHistory
# a few transcripts have multiple interviewers or interviewees!), but that is just done
# approximately while also staying within a minimum and max word count for a chunk more strictly.
#
class OhmsLegacyTranscriptChunker
class TranscriptChunker
# always want more than this many words
LOWER_WORD_LIMIT = 260

Expand All @@ -23,7 +23,10 @@ class OhmsLegacyTranscriptChunker

EMBEDDING_RETRY_WAIT = 5

attr_reader :transcript, :interviewee_names, :oral_history_content
attr_reader :interviewee_names, :oral_history_content

# @attribute paragraphs [Array<OralHistoryContent::Paragraph>]
attr_reader :paragraphs


# @param allow_embedding_wait_seconds [Integer] if we exceed open ai rate limit for getting
Expand All @@ -34,12 +37,24 @@ def initialize(oral_history_content:, allow_embedding_wait_seconds: 0)
raise ArgumentError.new("argument must be OralHistoryContent, but was #{oral_history_content.class.name}")
end

unless oral_history_content.ohms_xml.legacy_transcript.present?
raise ArgumentError.new("#{self.class.name} can only be used with a LegacyTranscript, but argument does not have one: #{oral_history_content.inspect}")
end

@oral_history_content = oral_history_content
@transcript = oral_history_content.ohms_xml.legacy_transcript

# different ways of extracting paragraphs, they all should return array of OralHistoryContent::Paragraph
@paragraphs = if oral_history_content.ohms_xml&.legacy_transcript.present?
oral_history_content.ohms_xml.legacy_transcript.paragraphs

elsif oral_history_content.ohms_xml
# TODO, new style transcript
raise ArgumentError.new("#{self.class.name} can only be used with OHMS transcripts if they are legacy: #{oral_history_content.inspect}")

elsif oral_history_content.searchable_transcript_source.present?
OralHistory::PlainTextParagraphSplitter.new(
plain_text: oral_history_content.searchable_transcript_source
).paragraphs

else
raise ArgumentError.new("#{self.class.name} can't find paragraph source content for: #{oral_history_content.inspect}")
end

# For matching to speaker names, assume it's "lastname, first dates" type heading,
# take last name and upcase
Expand Down Expand Up @@ -133,7 +148,7 @@ def split_chunks
current_chunk = []
paragraph_speaker_name = nil

transcript.paragraphs.each do |paragraph|
paragraphs.each do |paragraph|
last_paragraph_speaker_name = paragraph_speaker_name

# only change speaker name if we have one, otherwise leave last one
Expand All @@ -158,8 +173,8 @@ def split_chunks
# one with the presumed question. end_with? is used for some weird "multi-interviewee with
# same name" use cases, good enough.
elsif prospective_count >= WORD_GOAL &&
!interviewee_names.find { |n| paragraph_speaker_name.end_with? n } &&
interviewee_names.find { |n| last_paragraph_speaker_name.end_with? n }
!interviewee_names.find { |n| paragraph_speaker_name&.end_with? n } &&
interviewee_names.find { |n| last_paragraph_speaker_name&.end_with? n }
chunks << current_chunk

overlap_paragraphs = (chunks.last || []).last(1)
Expand Down
Loading