diff --git a/app/jobs/oh_transcript_chunker_job.rb b/app/jobs/oh_transcript_chunker_job.rb index be0825cb7..796062007 100644 --- a/app/jobs/oh_transcript_chunker_job.rb +++ b/app/jobs/oh_transcript_chunker_job.rb @@ -13,12 +13,7 @@ def perform(oral_history_content, delete_existing: false) end end - # check to make sure we are OHMS legacy, that's all we can do right now. - unless oral_history_content.ohms_xml.present? && oral_history_content.ohms_xml.legacy_transcript.present? - raise RuntimeError.new("We only know how to process legacy OHMS xml at present, can't process OralHistoryContent #{oral_history_content.id}") - end - - OralHistory::OhmsLegacyTranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records + OralHistory::TranscriptChunker.new(oral_history_content: oral_history_content, allow_embedding_wait_seconds: 10).create_db_records end end diff --git a/app/models/oral_history_content/ohms_xml/legacy_transcript.rb b/app/models/oral_history_content/ohms_xml/legacy_transcript.rb index 03b342213..e217dfec7 100644 --- a/app/models/oral_history_content/ohms_xml/legacy_transcript.rb +++ b/app/models/oral_history_content/ohms_xml/legacy_transcript.rb @@ -211,28 +211,14 @@ def self.word_count(*strings) strings.collect { |s| s.scan(/\w+/).count }.sum end - # holds an ordered list of Line's, and can describe - class Paragraph + # subclass for describing our paragraphs, with extra behavior that is line-based, + # as Legacy OHMS format is line-based. 
+ class Paragraph < ::OralHistoryContent::Paragraph # @return [Array] ordered list of Line objects attr_reader :lines attr_reader :transcript_id - # @return [integer] 1-based index of paragraph in document - attr_reader :paragraph_index - - # @return [Array] list of timestamps (as seconds) included in ths paragraph - attr_accessor :included_timestamps - - # @return [Integer] timestamp in seconds of the PREVIOUS timestamp to this paragraph, - # to latest the timestamp sure not to miss beginning of paragraph. - attr_accessor :previous_timestamp - - # @return [String] when the paragraph has no speaker name internally, we guess/assume - # it has the same speaker as previous paragraph. Store such an assumed speaker name - # from previous paragraph here. - attr_accessor :assumed_speaker_name - def initialize(lines = nil, paragraph_index:) @lines = lines || [] @paragraph_index = paragraph_index @@ -250,28 +236,24 @@ def text @lines.collect {|s| s.text.chomp }.join(" ").strip end - def word_count - @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text) - end - # @return [Range] from first to last line number, with line numbers being 1-indexed # in entire document. def line_number_range (@lines.first.line_num..@lines.last.line_num) end - # @return [String] to be used as an anchor within an HTML doc, that can be targeted - # with a link - def fragment_id - "oh-t#{transcript_id}-p#{paragraph_index}" - end - # @return [String] speaker name from any speaker label. Can be nil. Assumes # whole paragraph is one speaker, identified on first line, which # SHOULD be true, but weird things may happen if it ain't. def speaker_name lines.first&.speaker_label&.chomp(":") end + + # @return [String] to be used as an `id` attribute within an HTML doc, identifying a particular + # paragraph. 
+ def fragment_id + "oh-t#{transcript_id}-p#{paragraph_index}" + end end class Line diff --git a/app/models/oral_history_content/paragraph.rb b/app/models/oral_history_content/paragraph.rb new file mode 100644 index 000000000..d81e2474d --- /dev/null +++ b/app/models/oral_history_content/paragraph.rb @@ -0,0 +1,42 @@ +class OralHistoryContent + + # A model for an Oral History paragraph, used by the Chunker to make chunks. + # + # Different classes can create these, depending on what format of transcript they are creating + # from (OHMS, plain text, etc.); in some cases there may be subsets. + # + # Some may create sub-classes specific to their format, but this is a general API for chunkers. + class Paragraph + attr_reader :transcript_id + + # @return [Integer] 1-based index of paragraph in document + attr_reader :paragraph_index + + attr_reader :text + + # @return [Array] list of timestamps (as seconds) included in this paragraph + attr_accessor :included_timestamps + + # @return [Integer] timestamp in seconds of the PREVIOUS timestamp to this paragraph, + # i.e. the latest timestamp sure not to miss the beginning of the paragraph. + attr_accessor :previous_timestamp + + # @return [String] when the paragraph has no speaker name internally, we guess/assume + # it has the same speaker as previous paragraph. Store such an assumed speaker name + # from previous paragraph here. 
+ attr_accessor :assumed_speaker_name + + # OHMS transcript sub-classes get these from OHMS transcript model classes + attr_accessor :speaker_name, :text + + def initialize(text:, paragraph_index:, speaker_name:) + @text = text + @paragraph_index = paragraph_index + @speaker_name = speaker_name + end + + def word_count + @word_count ||= OralHistoryContent::OhmsXml::LegacyTranscript.word_count(text) + end + end +end diff --git a/app/services/oral_history/plain_text_paragraph_splitter.rb b/app/services/oral_history/plain_text_paragraph_splitter.rb new file mode 100644 index 000000000..2c76517ca --- /dev/null +++ b/app/services/oral_history/plain_text_paragraph_splitter.rb @@ -0,0 +1,88 @@ +module OralHistory + # Takes a plain text OH transcript, such as included for searching in the OralHistoryContent#searchable_transcript_source + # field, and splits it into OralHistoryContent::Paragraph objects, for use by chunker. + # + # Note that this does not let us know what PDF page the paragraph is on, which might be nice + # for citing. And we have no timestamps, no sync timestamps in these transcripts. + # + # This may be a temporary interim implementation, to demo AI vector search of this content, + # while we figure out how we want to do better citing/linking. + # + class PlainTextParagraphSplitter + attr_reader :plain_text + + def initialize(plain_text:) + @plain_text = plain_text + end + + # @return [Array<OralHistoryContent::Paragraph>] + def paragraphs + @paragraphs ||= split_paragraphs + end + + private + + def split_paragraphs + last_speaker_name = nil + current_speaker_name = nil + paragraph_index = 0 + + things = trim_transcript(plain_text).split(/(?:\r?\n\s*){2,}/).collect do |raw_paragraph| + raw_paragraph.strip! + + # There is some metadata that comes not only at beginning but sometimes in the middle + # after new tape/interview session. We don't want it. 
+ next if looks_like_metadata_line?(raw_paragraph) + + current_speaker_name = nil + # While this is not an OHMS transcript, the regex extracted from OHMS works well + if raw_paragraph =~ OralHistoryContent::OhmsXml::LegacyTranscript::OHMS_SPEAKER_LABEL_RE + current_speaker_name = $1.chomp(":") + end + + paragraph = OralHistoryContent::Paragraph.new(speaker_name: current_speaker_name, + paragraph_index: paragraph_index, + text: raw_paragraph) + if paragraph.speaker_name.blank? + paragraph.assumed_speaker_name = last_speaker_name + end + + + last_speaker_name = current_speaker_name + paragraph_index +=1 + + paragraph + end.compact + end + + def looks_like_metadata_line?(str) + # if it's one line, with one of our known metadata labels, colon, some info + str =~ /\A\s*(INTERVIEWEE|INTERVIEWER|DATE|LOCATION):.+$/ || + # Also for now just avoid the [END OF ...] markers. + str =~ /\A\[END OF INTERVIEW.*\]\s*$/ || + str =~ /\A\[END OF TAPE.*\]\s*$/ + end + + # Trim END after last [END OF INTERVIEW] marker -- get rid of footnote and index. + def trim_transcript(plain_text) + plain_text = plain_text.dup + + # we sometimes have unicode BOM and nonsense in there + plain_text.gsub!(/[\u200B\uFEFF]/, '') + + # Transcripts often have [END OF INTERVIEW] markers: strip the LAST one in the transcript and anything after it + # ; we'll use negative lookahead to mean "last one, not another one after it" + if plain_text =~ /\[END OF INTERVIEW( \d+)?\]/ + plain_text.gsub!(/\[END OF INTERVIEW( \d+)?\](?!.*\[END OF INTERVIEW).*/m, '') + elsif plain_text =~ /NOTES|INDEX/ + # But sometimes they don't, but still have a NOTES and/OR INDEX? On a line by itself, + # eliminate with everything afterwards. NOTE(review): `|` binds loosest, so the regex below is `^NOTES` OR `INDEX$.*` and a NOTES match removes only the word itself; likely want /^(?:NOTES|INDEX)$.*/m -- confirm. + plain_text.gsub!(/^NOTES|INDEX$.*/m, '') + end + + plain_text.strip! 
+ + plain_text + end + end +end diff --git a/app/services/oral_history/ohms_legacy_transcript_chunker.rb b/app/services/oral_history/transcript_chunker.rb similarity index 89% rename from app/services/oral_history/ohms_legacy_transcript_chunker.rb rename to app/services/oral_history/transcript_chunker.rb index b4289d8db..baa2ebd69 100644 --- a/app/services/oral_history/ohms_legacy_transcript_chunker.rb +++ b/app/services/oral_history/transcript_chunker.rb @@ -6,7 +6,7 @@ module OralHistory # a few transcripts have multiple interviewers or intervieweees!), but that is just done # approximately while also staying within a minimum and max word count for a chunk more strictly. # - class OhmsLegacyTranscriptChunker + class TranscriptChunker # always want more than this many words LOWER_WORD_LIMIT = 260 @@ -23,7 +23,10 @@ class OhmsLegacyTranscriptChunker EMBEDDING_RETRY_WAIT = 5 - attr_reader :transcript, :interviewee_names, :oral_history_content + attr_reader :interviewee_names, :oral_history_content + + # @attribute paragraphs [Array] + attr_reader :paragraphs # @param allow_embedding_wait_seconds [Integer] if we exceed open ai rate limit for getting @@ -34,12 +37,24 @@ def initialize(oral_history_content:, allow_embedding_wait_seconds: 0) raise ArgumentError.new("argument must be OralHistoryContent, but was #{oral_history_content.class.name}") end - unless oral_history_content.ohms_xml.legacy_transcript.present? - raise ArgumentError.new("#{self.class.name} can only be used with a LegacyTranscript, but argument does not have one: #{oral_history_content.inspect}") - end - @oral_history_content = oral_history_content - @transcript = oral_history_content.ohms_xml.legacy_transcript + + # different ways of extracting paragraphs, they all should return array of OralHistoryContent::Paragraph + @paragraphs = if oral_history_content.ohms_xml&.legacy_transcript.present? 
+ oral_history_content.ohms_xml.legacy_transcript.paragraphs + + elsif oral_history_content.ohms_xml + # TODO, new style transcript + raise ArgumentError.new("#{self.class.name} can only be used with OHMS transcripts if they are legacy: #{oral_history_content.inspect}") + + elsif oral_history_content.searchable_transcript_source.present? + OralHistory::PlainTextParagraphSplitter.new( + plain_text: oral_history_content.searchable_transcript_source + ).paragraphs + + else + raise ArgumentError.new("#{self.class.name} can't find paragraph source content for: #{oral_history_content.inspect}") + end # For matching to speaker names, assume it's "lastname, first dates" type heading, # take last name and upcase @@ -133,7 +148,7 @@ def split_chunks current_chunk = [] paragraph_speaker_name = nil - transcript.paragraphs.each do |paragraph| + paragraphs.each do |paragraph| last_paragraph_speaker_name = paragraph_speaker_name # only change speaker name if we have one, otherwise leave last one @@ -158,8 +173,8 @@ def split_chunks # one with the presumed question. end_with? is used for some weird "multi-interviewee with # same name" use cases, good enough. elsif prospective_count >= WORD_GOAL && - !interviewee_names.find { |n| paragraph_speaker_name.end_with? n } && - interviewee_names.find { |n| last_paragraph_speaker_name.end_with? n } + !interviewee_names.find { |n| paragraph_speaker_name&.end_with? n } && + interviewee_names.find { |n| last_paragraph_speaker_name&.end_with? 
n } chunks << current_chunk overlap_paragraphs = (chunks.last || []).last(1) diff --git a/spec/services/oral_history/ohms_legacy_transcript_chunker_spec.rb b/spec/services/oral_history/ohms_legacy_transcript_chunker_spec.rb deleted file mode 100644 index d2944a2d7..000000000 --- a/spec/services/oral_history/ohms_legacy_transcript_chunker_spec.rb +++ /dev/null @@ -1,126 +0,0 @@ -require 'rails_helper' - -describe OralHistory::OhmsLegacyTranscriptChunker do - let(:ohms_xml_path) { Rails.root + "spec/test_support/ohms_xml/legacy/hanford_OH0139.xml"} - - let(:work) { - build(:oral_history_work, :ohms_xml, - ohms_xml_text: File.read(ohms_xml_path), - creator: [{ category: "interviewee", value: "Hanford, William E., 1908-1996"}, - { category: "interviewer", value: "Bohning, James J."}] - ) - } - let(:interviewee_speaker_label) { "HANFORD" } - - let(:oral_history_content) { work.oral_history_content } - let(:legacy_transcript) { oral_history_content.ohms_xml.legacy_transcript } - - let(:chunker) { described_class.new(oral_history_content: oral_history_content) } - - def word_count(*strings) - # use consistent word count algorithm - OralHistoryContent::OhmsXml::LegacyTranscript.word_count(*strings) - end - - describe "#split_chunks" do - let(:chunks) { chunker.split_chunks } - - it "creates chunks as arrays of Paragraphs" do - expect(chunks).to be_kind_of(Array) - expect(chunks).to be_present - - expect(chunks).to all satisfy { |chunk| chunk.kind_of?(Array) } - expect(chunks).to all satisfy { |chunk| chunk.present? } - expect(chunks).to all satisfy { |chunk| chunk.all? 
{|item| item.kind_of?(OralHistoryContent::OhmsXml::LegacyTranscript::Paragraph) } } - end - - it "begins with first paragraph" do - expect(chunks.first.first).to eq legacy_transcript.paragraphs.first - end - - it "ends with last paragraph" do - expect(chunks.last.last).to eq legacy_transcript.paragraphs.last - end - - it "has two paragraphs of overlap in each chunk" do - 0.upto(chunks.length - 2).each do |index| - first_chunk = chunks[index] - second_chunk = chunks[index + 1] - - expect(second_chunk.first(1)).to eq (first_chunk.last(1)) - end - end - - it "all chunks over minimum word count" do - # except possibly the last one, which just has what's left. - expect(chunks.slice(0, chunks.length - 1)).to all satisfy { |chunk| word_count(*chunk.collect(&:text)) >= described_class::LOWER_WORD_LIMIT } - end - - it "all chunks below max word count" do - # this is not technically invariant, if we have really long paragraphs it might be forced - # to go over, but it's true in this example. - expect(chunks).to all satisfy { |chunk| word_count(*chunk.collect(&:text)) <= described_class::UPPER_WORD_LIMIT } - end - - it "chunks mostly start with questioner" do - # third paragraph is the first uniquely new one, first two are overlap. We try - # to make that first unique one be the interviewer, not the interviewee. - # - # But it's definitely not invariant, depends on paragraph size, depends on transcript, with - # the smaller chunks we're using a lot of them wont' end "right", it's okay. 
- - interviewee_first_list = chunks.find_all { |chunk| chunk.third.speaker_name == interviewee_speaker_label } - - expect(interviewee_first_list.count.to_f / chunks.length).to be <= 0.20 - end - end - - describe "#build_chunk_record" do - let(:list_of_paragraphs) { legacy_transcript.paragraphs.slice(7, 5) } - - it "builds good record" do - record = chunker.build_chunk_record(list_of_paragraphs) - - expect(record).to be_kind_of(OralHistoryChunk) - expect(record.persisted?).to be false - expect(record.oral_history_content).to eq oral_history_content - - expect(record.embedding).to be nil - - expect(record.start_paragraph_number).to eq list_of_paragraphs.first.paragraph_index - expect(record.end_paragraph_number).to eq list_of_paragraphs.last.paragraph_index - expect(record.text).to eq list_of_paragraphs.collect(&:text).join("\n\n") - - expect(record.speakers).to eq ["HANFORD", "BOHNING"] - - # json standard says hash keys must be string, pg will insist - list_of_paragraphs.each do |paragraph| - timestamp_data = record.other_metadata["timestamps"][paragraph.paragraph_index.to_s] - expect(timestamp_data).to be_present - expect(timestamp_data["included"]).to eq paragraph.included_timestamps - expect(timestamp_data["previous"]).to eq paragraph.previous_timestamp - end - end - end - - describe "#create_db_records" do - # duarte is a nice short one (we don't really have it in OHMS, but works for short test) - let(:ohms_xml_path) { Rails.root + "spec/test_support/ohms_xml/legacy/duarte_OH0344.xml"} - - describe "with mocked OpenAI embeddings" do - before do - allow(OralHistoryChunk).to receive(:get_openai_embeddings) { |*args| [OralHistoryChunk::FAKE_EMBEDDING] * args.count } - end - - it "saves multiple records" do - chunker.create_db_records - - chunks = oral_history_content.reload.oral_history_chunks - - expect(chunks).to be_present - expect(chunks.first.start_paragraph_number).to eq 1 - expect(chunks.last.end_paragraph_number).to eq legacy_transcript.paragraphs.count - 
end - end - end -end diff --git a/spec/services/oral_history/plain_text_paragraph_splitter_spec.rb b/spec/services/oral_history/plain_text_paragraph_splitter_spec.rb new file mode 100644 index 000000000..bce6a0975 --- /dev/null +++ b/spec/services/oral_history/plain_text_paragraph_splitter_spec.rb @@ -0,0 +1,36 @@ +require 'rails_helper' + +describe OralHistory::PlainTextParagraphSplitter do + let(:raw_transcript_text) { File.read( Rails.root + "spec/test_support/ohms_xml/baltimore_plain_text_transcript_sample.txt")} + + let(:splitter) { described_class.new(plain_text: raw_transcript_text)} + let(:paragraphs) { splitter.paragraphs } + + it "splits into good paragraphs" do + expect(paragraphs).to be_present + + # skips prefatory metadata + expect(paragraphs.first.speaker_name).to eq "SCHLESINGER" + expect(paragraphs.first.text).to start_with "SCHLESINGER: Let’s not start at the beginning but" + + # gets the assumed speaker + expect(paragraphs[5].speaker_name).to eq nil + expect(paragraphs[5].assumed_speaker_name).to eq "BALTIMORE" + expect(paragraphs[5].text).to start_with "So I went out there for the summer" + + # skips more metadata + expect(paragraphs[8].speaker_name).to eq "SCHLESINGER" + expect(paragraphs[8].text).to eq "SCHLESINGER: When did you come to MIT as a faculty member?" + + + # Does not include back matter, last paragraph is last transcript paragraph + expect(paragraphs.last.speaker_name).to eq "BALTIMORE" + expect(paragraphs.last.text).to start_with "BALTIMORE: No, the new patent was the work of Mark Feinberg and Raul Andino" + + # does not include any double newlines + expect(paragraphs).to all satisfy { |p| ! (p.text =~ /\n\n/) } + + # no blank ones + expect(paragraphs).to all satisfy { |p| ! 
(p.text =~ /\A\s*\Z/) } + end +end diff --git a/spec/services/oral_history/transcript_chunker_spec.rb b/spec/services/oral_history/transcript_chunker_spec.rb new file mode 100644 index 000000000..7d81cd7e7 --- /dev/null +++ b/spec/services/oral_history/transcript_chunker_spec.rb @@ -0,0 +1,172 @@ +require 'rails_helper' + +describe OralHistory::TranscriptChunker do + let(:ohms_xml_path) { Rails.root + "spec/test_support/ohms_xml/legacy/hanford_OH0139.xml"} + let(:oral_history_content) { work.oral_history_content } + let(:chunker) { described_class.new(oral_history_content: oral_history_content) } + + def word_count(*strings) + # use consistent word count algorithm + OralHistoryContent::OhmsXml::LegacyTranscript.word_count(*strings) + end + + describe "OHMS Legacy Transcript" do + let(:work) { + build(:oral_history_work, :ohms_xml, + ohms_xml_text: File.read(ohms_xml_path), + creator: [{ category: "interviewee", value: "Hanford, William E., 1908-1996"}, + { category: "interviewer", value: "Bohning, James J."}] + ) + } + + let(:interviewee_speaker_label) { "HANFORD" } + let(:legacy_transcript) { oral_history_content.ohms_xml.legacy_transcript } + + describe "#split_chunks" do + let(:chunks) { chunker.split_chunks } + + it "creates chunks as arrays of Paragraphs" do + expect(chunks).to be_kind_of(Array) + expect(chunks).to be_present + + expect(chunks).to all satisfy { |chunk| chunk.kind_of?(Array) } + expect(chunks).to all satisfy { |chunk| chunk.present? } + expect(chunks).to all satisfy { |chunk| chunk.all? 
{|item| item.kind_of?(OralHistoryContent::Paragraph) } } + end + + it "begins with first paragraph" do + expect(chunks.first.first).to eq legacy_transcript.paragraphs.first + end + + it "ends with last paragraph" do + expect(chunks.last.last).to eq legacy_transcript.paragraphs.last + end + + it "has two paragraphs of overlap in each chunk" do + 0.upto(chunks.length - 2).each do |index| + first_chunk = chunks[index] + second_chunk = chunks[index + 1] + + expect(second_chunk.first(1)).to eq (first_chunk.last(1)) + end + end + + it "all chunks over minimum word count" do + # except possibly the last one, which just has what's left. + expect(chunks.slice(0, chunks.length - 1)).to all satisfy { |chunk| word_count(*chunk.collect(&:text)) >= described_class::LOWER_WORD_LIMIT } + end + + it "all chunks below max word count" do + # this is not technically invariant, if we have really long paragraphs it might be forced + # to go over, but it's true in this example. + expect(chunks).to all satisfy { |chunk| word_count(*chunk.collect(&:text)) <= described_class::UPPER_WORD_LIMIT } + end + + it "chunks mostly start with questioner" do + # second paragraph is the first uniquely new one, first one are overlap. We try + # to make that first unique one be the interviewer, not the interviewee. + # + # But it's definitely not invariant, depends on paragraph size, depends on transcript, with + # the smaller chunks we're using a lot of them wont' end "right", it's okay. 
+ + interviewee_first_list = chunks.find_all { |chunk| chunk.third.speaker_name == interviewee_speaker_label } + + expect(interviewee_first_list.count.to_f / chunks.length).to be <= 0.20 + end + end + + describe "#build_chunk_record" do + let(:list_of_paragraphs) { legacy_transcript.paragraphs.slice(7, 5) } + + it "builds good record" do + record = chunker.build_chunk_record(list_of_paragraphs) + + expect(record).to be_kind_of(OralHistoryChunk) + expect(record.persisted?).to be false + expect(record.oral_history_content).to eq oral_history_content + + expect(record.embedding).to be nil + + expect(record.start_paragraph_number).to eq list_of_paragraphs.first.paragraph_index + expect(record.end_paragraph_number).to eq list_of_paragraphs.last.paragraph_index + expect(record.text).to eq list_of_paragraphs.collect(&:text).join("\n\n") + + expect(record.speakers).to eq ["HANFORD", "BOHNING"] + + # json standard says hash keys must be string, pg will insist + list_of_paragraphs.each do |paragraph| + timestamp_data = record.other_metadata["timestamps"][paragraph.paragraph_index.to_s] + expect(timestamp_data).to be_present + expect(timestamp_data["included"]).to eq paragraph.included_timestamps + expect(timestamp_data["previous"]).to eq paragraph.previous_timestamp + end + end + end + + describe "#create_db_records" do + # duarte is a nice short one (we don't really have it in OHMS, but works for short test) + let(:ohms_xml_path) { Rails.root + "spec/test_support/ohms_xml/legacy/duarte_OH0344.xml"} + + describe "with mocked OpenAI embeddings" do + before do + allow(OralHistoryChunk).to receive(:get_openai_embeddings) { |*args| [OralHistoryChunk::FAKE_EMBEDDING] * args.count } + end + + it "saves multiple records" do + chunker.create_db_records + + chunks = oral_history_content.reload.oral_history_chunks + + expect(chunks).to be_present + expect(chunks.first.start_paragraph_number).to eq 1 + expect(chunks.last.end_paragraph_number).to eq legacy_transcript.paragraphs.count + 
end + end + end + end + + describe "searchable_transcript_source plain text" do + let(:raw_transcript_text) { File.read( Rails.root + "spec/test_support/ohms_xml/baltimore_plain_text_transcript_sample.txt")} + + let(:work) { + build(:oral_history_work, + creator: [{ category: "interviewee", value: "Baltimore, David, 1938-"}, + { category: "interviewer", value: "Schlesinger, Sondra"}] + ).tap { |w| w.oral_history_content.searchable_transcript_source = raw_transcript_text } + } + + let(:interviewee_speaker_label) { "BALTIMORE" } + + + describe "#split_chunks" do + let(:chunks) { chunker.split_chunks } + let(:splitter) { OralHistory::PlainTextParagraphSplitter.new(plain_text: raw_transcript_text)} + + it "creates chunks as arrays of Paragraphs" do + expect(chunks).to be_kind_of(Array) + expect(chunks).to be_present + + expect(chunks).to all satisfy { |chunk| chunk.kind_of?(Array) } + expect(chunks).to all satisfy { |chunk| chunk.present? } + expect(chunks).to all satisfy { |chunk| chunk.all? 
{|item| item.kind_of?(OralHistoryContent::Paragraph) } } + end + + it "begins with first paragraph" do + expect(chunks.first.first.text).to eq splitter.paragraphs.first.text + end + + it "ends with last paragraph" do + expect(chunks.last.last.text).to eq splitter.paragraphs.last.text + end + + it "has two paragraphs of overlap in each chunk" do + 0.upto(chunks.length - 2).each do |index| + first_chunk = chunks[index] + second_chunk = chunks[index + 1] + + expect(second_chunk.first(1)).to eq (first_chunk.last(1)) + end + end + end + end +end diff --git a/spec/test_support/ohms_xml/baltimore_plain_text_transcript_sample.txt b/spec/test_support/ohms_xml/baltimore_plain_text_transcript_sample.txt new file mode 100644 index 000000000..894b4bee0 --- /dev/null +++ b/spec/test_support/ohms_xml/baltimore_plain_text_transcript_sample.txt @@ -0,0 +1,160 @@ + INTERVIEWEE: David Baltimore + +INTERVIEWER: Sondra Schlesinger + +DATE: 7 February 1994 + +LOCATION: Rockefeller University, New York + + + +SCHLESINGER: Let’s not start at the beginning but just at the end of college, the beginning of graduate school, and tell me a little bit about your major and how you got interested in biology. + + +BALTIMORE: Do you mean graduate school or when I became interested in biology? I did well in science in high school and my mother, noting that, asked whether I wanted to spend the summer at the Jackson Labs where they had a program for high-school students. So I went there between my junior and senior years in high school and had a wonderful experience in learning about science, biology, genetics, and doing experiments with three wonderful people—which is how they set up those programs—Elizabeth [Tibby] Russell, Will Silvers and Don Bailey. Actually, I first met Howard [M.] Temin at the Jackson Labs that summer. Then, when I went to college, the biology at Swarthmore [College] was terrible. It was extremely rote. 
The head of the department felt, almost ideologically, that it was inappropriate to have any biochemistry or molecular biology, although molecular biology didn’t have a name then; it was not really thought about. He said those things were for graduate school. It was important for an undergraduate to learn embryology, anatomy, and things like that. So I took those things, a little bit and I got sick of them. Actually I ended up a chemistry major because they let me do research. They were much more forward-looking people. But the reason I wanted to do biology was because of my high-school experience. Then in college I spent one summer—again between my junior and senior years—at Cold Spring Harbor with George Streisinger. That was a wonderful experience. There is a whole story about how I happened to do that, but it’s interesting. Actually I had even spent an earlier summer doing research at Mt. Sinai with a guy named Bob Lideen. So I knew what I wanted to do right from the start. I mean, by the time I was on my way to being a senior in high school. + + +SCHLESINGER: So you knew that you wanted to do research not medicine? + + +BALTIMORE: Right. Well, I sort-of took a pre-med curriculum in case I decided I wanted to go to medical school because that was one of those open options, but I never really thought about it terribly seriously. In the summer that I was in Cold Spring Harbor both [Salvador E.] Luria and Cy [Cyrus] Levinthal came through Cold Spring Harbor and approached me about coming to MIT [Massachusetts Institute of Technology] because at that time they were just getting the program going and they didn’t know where they were going to get students from. Here I was interested in the kinds of things that they thought somebody should be interested in. I did well in George’s lab so, basically, I never applied to go to graduate school except to go to MIT. It was a different world. + + +BALTIMORE: Well, I was attracted to viruses from the very beginning. 
You know, I hadn’t ever thought about this. When I was an undergraduate—I said I wouldn’t tell you this story about how I ended up in Cold Spring Harbor, but I will because it becomes relevant. I was taking a microbiology seminar, I guess in my junior year, and was reading all about phage and bacteria and I said, “Could we just look at a phage plaque?” I just wanted to know what we were talking about and the guy who ran the course actually worked on diatoms. This was at Swarthmore and the faculty all had little research programs. He didn’t know anything about bacteria or phage but he said, “If you can get the materials, we’ll do it.” He was a wonderful man. So during Easter recess of that year I went to Cold Spring Harbor, because I lived on Long Island. I lived in Great Neck and I knew Cold Spring Harbor because we used to go to the fish hatchery there as a weekend outing from home when I was a kid. So I connected the name. I’ll just go find a lab and talk to somebody and the only person I knew to talk to was Helen Gay who didn’t work on bacteria or bacteriophage but her name turned up a lot in cytology literature. I was kind of interested in cytology. So I sought her out. I just drove out there or maybe I called and she said, “I can’t help you but I think George Streisinger could,” and I had never heard his name before. George is one of those enigmatic figures to most people because he was so much a scientist. He published almost nothing. So I went upstairs and there was George sitting behind his desk in leather thongs, drinking Coca-Cola, and looking like something that just came off the moon. I was both amazed and enthralled by him. Sure he would give me the stuff. We started talking and I guess I must have impressed him because he said to me, “Would you like to come out here this summer and work with me?” I said, “Yes, I would love to.” But I had already made a commitment to go to Mt. Sinai to continue working with the people I had worked with before. 
They worked in parasitology. I can’t remember which one. I think that was what the commitment was. So I called whomever I was committed to and said, “They have offered me a job in Cold Spring Harbor and I don’t know what to do about.” Whoever it was said, “Go to Cold Spring Harbor. You’d have a wonderful time here, but that’s a rare opportunity.” And so I did. It was the first year of the URPP [undergraduate research participation program] but the reason George asked me was they had this money from NSF [National Science Foundation] and they didn’t have any students yet. They had the money to train college students and so I was in the first URPP class. + + So I went out there for the summer and I worked with George asking whether T4 [bacteriophage]—T4 I think—required DNA synthesis in order to recombine. I believe that was the question and we were blocking replication and were looking at whether recombinants came out. I don’t even remember what answer we got but I do remember the last thing we had to do was some P32 experiments to see if we were really blocking replication with—I think was drugs we used—and that was kind of messy. I had no experience with that but I managed to get some numbers out of it. The last day, I was there and George said, “Well I don’t know if we’ll get to publish this or not”—it was never published like most of what happened in George’s lab, but the key was to go downstairs and to present the work to [Jun-ichi] Tomizawa, who was then in [Alfred D.] Hershey’s lab, because Tomizawa was the smartest man in the world. If Tomisawa thought the experiments were good experiments, then they were good experiments. So we did that. Tomizawa, I think, liked the experiments—I’m not sure he understood them. + + +SCHLESINGER: Did you go to the lectures that they had in conjunction with the courses? + + +BALTIMORE: Oh, sure I went to lectures all the time. 
By that time—this is between my junior and senior years of college—I knew a fair amount of molecular biology, almost all self-taught. I mean, we haven’t discussed college, but actually, in my senior year of college I taught a molecular biology course because there were so many undergraduates who wanted to know something about it. As I said, the faculty was more or less opposed to it. So I just did it for interested people and we would meet once a week. I knew more than they did, but only barely. And what I knew was, of course, very unsystematic and very unfiltered by anybody who really knew what they were talking about and therefore confused and spotty. + + +[END OF INTERVIEW 1] +INTERVIEWEE: David Baltimore + +INTERVIEWER: Sondra Schlesinger + +DATE: 13 April 1995 + +LOCATION: Massachusetts Institute of Technology + + + +SCHLESINGER: When did you come to MIT as a faculty member? + + +BALTIMORE: I came in 1968 arriving here on a cold day in January, the first of January. + + +SCHLESINGER: What were the arrangements that had been made for your coming here, what was the package that you were getting? + + +BALTIMORE: I never worried much about it. I had been at MIT before and I knew the people I was dealing with. They just said they would take care of me and I really didn’t negotiate for anything. In fact, I remember kind of jokingly saying that the only thing I really cared about was that I got a decent parking space. + + +SCHLESINGER: Actually more than one would have expected, because I think it took a long time before people could find viruses. + + +BALTIMORE: It was very slow to develop the generality of it and also the extent of the mouse genome, the human genome too, better known than the mouse genome, that comes from reverse transcription which is something between one and 10 percent of the genome comes about by reverse transcription. All these pseudogenes as well as the retrotransposons and that is an enormous amount. 
It’s a very widespread process but it still doesn’t really allow for fixation in the germ line of events that have occurred. + + +[END OF TAPE, SIDE 2] + + +SCHLESINGER: David we are in 1971, you had made the discovery of reverse transcriptase. + + +[END OF TAPE, SIDE 4] + + +[END OF INTERVIEW 2] + +INTERVIEWEE: David Baltimore + +INTERVIEWER: Sondra Schlesinger + +DATE: 29 April 1995 + +LOCATION: David Baltimore’s home, Boston, MA + + + +BALTIMORE: Alice, when did we decide to go on sabbatical in New York? + + +HUANG: We went 1975 or 1976. + + +BALTIMORE: I know but when did we decide to go and why? + + +HUANG: When and why? You were due for a sabbatical and I had a RCDA [Research Career Development Award] for my 4th year. + + +BALTIMORE: Anyway, so that’s what I remember, we just kind of decided to take a sabbatical. + + +SCHLESINGER: All right. You had mentioned that you were getting interested in immunology. + + +HUANG: You were looking around because you were thinking of going into immunology. + + +BALTIMORE: Right, I remember. Fundamentally, the original impetus to take a sabbatical came from the idea of becoming more involved in immunology. Also it was convenient for Alice, so we committed ourselves. Then I started looking for a place, presumably in the winter of 1974-1975, in there. I talked to people in Australia and in Basel. I could have done either one, but I guess we finally decided it was just too far away—we had a small baby and we weren’t sure whether we wanted to take her out of the country. I had a big lab and lots going on and I didn’t know how far away I wanted to be. So it all became a very convenient sort-of compromise to go to New York. Alice had a good place to work in New York with Purnell Choppin. Jim Darnell and I had remained close from the time that I had been in his lab at MIT, and then I was sort of with him at Einstein. So I decided to do something that had no real learning component to it, but was just a change of venue. 
I figured I could also learn some things. In the end, I did learn some things from working with Warren Jellinek, whom I ended up doing some experiments with. He was a young assistant professor in Jim’s unit and was extremely, technically very knowledgeable and interested in interesting things. + + +SCHLESINGER: So, in fact, you changed the idea. There was no longer any focus on immunology. + + +BALTIMORE: There was no longer a focus on immunology. I figured that I could kind of absorb that if I needed to and that New York was as good a place as any, although there wasn’t anybody there with whom I could learn. Of course the Nobel Prize came that fall just after I arrived in New York. That pretty well undermined a lot of the time that I had on the sabbatical. + + +SCHLESINGER: So why don’t you describe a little bit about what that day was like. + + +BALTIMORE: Well, this was a remarkable kind of situation. I came back from the Soviet Union. I had gone with a group from the National Academy of Sciences on a trip to the Soviet Union. We had gone to Moscow and to Tashkent. Actually, the meeting was held in Kiev and then we went off to Tashkent, to Bukara and came back. It was an incredible experience as it was for anybody in those days who went to the Soviet Union—to discover how backward it was, how poor it was, how difficult it was to do science. How paranoid everybody was about the secret police, but how wonderful the people were all as individuals; extremely well educated, thoughtful. The man I developed the strongest admiration for was Vadim Agol, whom we know is a great scientist and who had devoted his life to doing science in the Soviet Union. Had he been able to work and travel outside the Soviet Union, he would be one of the world’s great scientists today. As it is, he is highly admired in the virology community but really not known by anybody else. 
I got to know some of his friends and late at night in Kiev as we were drinking more and more vodka, a lot of these guys would open up, some of them terribly irresponsibly, but they felt that the place that we were living, which was a student hostel in Kiev, was a safe place to talk. So it had been a very intense, very exciting, very politically demoralizing experience because if this is what the Left meant then, a lot of people have been kidding themselves for a long time about directions of politics. So it was complicated. + + I came back and moved immediately to New York, or maybe we had moved already. I can’t remember. I came back to New York and we got settled, I think we just moved down right away and we got settled in the apartment at Rockefeller and then spent a little time getting settled in the lab. We brought a housekeeper with us and were getting organized in New York. Right in the beginning of October, Alice went to a meeting in Denmark. It was a meeting on comparative leukemia and she had done some work on VSV pseudotypes that was relevant to that. It was kind of amusing because it was a meeting that I, in principle, should have been at. Although, I had decided not to go. We couldn’t both go away at that point anyway, and I knew I was traveling plenty; I didn’t need that meeting. Howard Temin might have been there but wasn’t. So the meeting took place and I was living in New York. What happened was that the session Alice was talking in was presumably on a Monday morning, whenever it is the Nobel Prize is announced. George Klein was the organizer of the session and at the end of the session, George got up and said, “I want to summarize this session,” and went on, as Alice said, endlessly, summarizing unnecessarily this whole session. He finally said, “Look the Nobel Prize is going to be announced in a half an hour, but I can’t keep talking for a half an hour. 
I’ve got to tell you who is going to win the Nobel Prize.” He announced that Howard and I were going to win the Nobel Prize that year. So Alice ran to the phone and called me and woke me at 6:30 in the morning or something in New York. It was about noon in Denmark. So I’m probably the only person in history who learned that they won the Nobel Prize from his wife. She woke me out of a sound sleep and told me this. I laid back after the conversation - + + +SCHLESINGER: You mentioned the work on the heterologous sequences, but did that begin in your lab—the introduction of heterologous sequences into polio—or is that just something you’re part of because of your interactions with Vinney? + + +BALTIMORE: No, the new patent was the work of Mark Feinberg and Raul Andino (34). What they showed was that you could put another gene into polio by inserting it right at the beginning of VP4, right at the beginning of the polyprotein, and putting a cleavage site in there. That it is cleaved off by the protease, and works better than any other method of putting genes into polio. Actually that was their idea. I am not on that patent, come to think of it, because they really had thought of it, done it, and carried it through. + + +[END OF TAPE, SIDE 3] + + +[END OF INTERVIEW 3] + +NOTES + +1. John T. Edsall and Jeffries Wyman. Biophysical Chemistry (New York: Academic Press, 1958). + +2. Salvador E. Luria. General Virology (New York: Wiley Publishing, 1953). + +3. D. Baltimore and R. M. Franklin, “The effect of mengovirus infection on the activity of the DNA-dependent RNA polymerase of L-cells,” Proceedings of the National Academy of Science, U. S. 48 (1962): 1383-1390. + +4. R. M. Franklin and D. Baltimore, “Patterns of macromolecular synthesis in normal and virus-infected mammalian cells,” Cold Spring Harbor Symposium Quant. Biol. 27 (1962): 175-198. + +5. D. Baltimore and R. M. 
Franklin, “Preliminary data on a virus-specific enzyme system responsible for the synthesis of viral RNA,” Biochemical and Biophysical Research Communications 9 (1962): 388-392. + +6. D. Baltimore and R. M. Franklin, “Properties of the mengovirus and poliovirus RNA polymerases,” Cold Spring Harbor Symposium Quant. Biol. 28 (1963): 105-108. + +7. D. Baltimore, R. M. Franklin, H. J. Eggers, and I. Tamm, “Poliovirus induced RNA polymerase and the effects of virus-specific inhibitors on its production,” Proceedings of the National Academy of Science, U. S. 49 (1963): 843-849. + +8. David Baltimore, “The Diversion of Macromolecular Synthesis in L-cells towards Ends Dictated by Mengovirus,” Doctoral dissertation. The Rockefeller University, 1964, 86 pages. + +9. Bernard D. Davis, et al. Principles of Microbiology and Immunology (New York: Harper and Row, 1968). + + 10. D. Baltimore, Y. Becker, and J. E. Darnell, “Virus-specific double-stranded RNA in poliovirus-infected cells,” Science 143 (1964): 1034-1036. + + 11. D. Rekosh, H. F. Lodish, and D. Baltimore, “Translations of poliovirus RNA by an E. coli cell-free system,” Cold Spring Harbor Symposium Quant. Biol. 34 (1969): 747-751. + + D. Rekosh, H. F. Lodish, and D. Baltimore, “Protein synthesis in Escherichia coli extracts programmed by poliovirus RNA,” Journal of Molecular Biology 54 (1970): 327-340. + + 12. Lydia Villa-Komaroff. “Translation of Poliovirus RNA in Eukaryotic Cell-free Systems.” Dissertation. Massachusetts Institute of Technology, 1975. +