Skip to content

Commit

Permalink
finished first pass at nokogiri datastream with solr indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
flyingzumwalt committed Jun 21, 2010
1 parent 965b369 commit 11fd295
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 49 deletions.
1 change: 1 addition & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ begin
gem.add_dependency('mime-types', '>= 1.16')
gem.add_dependency('multipart-post')
gem.add_dependency('nokogiri')
gem.add_dependency('om', '>= 0.1.2')
gem.add_dependency('yaml')

gem.add_development_dependency "rspec", ">= 1.2.9"
Expand Down
39 changes: 39 additions & 0 deletions lib/active_fedora/nokogiri_datastream.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,44 @@ def to_solr(solr_doc = Solr::Document.new) # :nodoc:

return solr_doc
end

# Walks the nodes matched by one accessor and writes them, plus the nodes
# matched by any child accessors, into a Solr document.
#
# For every node returned by +lookup+ this calls +solrize_node+ and then
# recurses into each entry of the accessor's :children hash, extending the
# parents list with {accessor_name => node_index} so that the child lookup
# can be scoped to that particular node.
#
# accessor_name - name of the accessor to index.
# accessor_info - info hash for the accessor; when nil it is looked up via
#                 self.class.accessor_info using the full accessor pointer.
# opts          - :solr_doc => document to append fields to (a new
#                 Solr::Document is created only when none is given)
#                 :parents  => pointer entries leading to this accessor
#                 (defaults to []).
#
# Raises RuntimeError when accessor_info is nil and no accessor is registered
# for the pointer.
# Returns the value of iterating the lookup result (the node set itself for
# Array/Enumerable receivers).
def solrize_accessor(accessor_name, accessor_info, opts={})
  # Block form so a fresh document is only allocated when the caller did not
  # supply one.
  solr_doc = opts.fetch(:solr_doc) { Solr::Document.new }
  parents = opts.fetch(:parents, [])

  accessor_pointer = parents + [accessor_name]

  if accessor_info.nil?
    accessor_info = self.class.accessor_info(accessor_pointer)
    if accessor_info.nil?
      # Report the pointer that failed to resolve; accessor_info is nil here
      # and cannot be interrogated.
      raise "No accessor is defined for #{accessor_pointer.inspect}"
    end
  end

  # prep children hash
  child_accessors = accessor_info.fetch(:children, {})
  xpath = self.class.accessor_xpath(*accessor_pointer)
  nodeset = lookup(xpath)

  # each_with_index gives the positional index directly; searching the set
  # with index(node) is quadratic and returns the first match when two nodes
  # compare equal.
  nodeset.each_with_index do |node, node_index|
    # create solr fields
    solrize_node(node, accessor_pointer, solr_doc)
    child_accessors.each_pair do |child_accessor_name, child_accessor_info|
      child_parents = parents + [{accessor_name => node_index}]
      solrize_accessor(child_accessor_name, child_accessor_info, :solr_doc => solr_doc, :parents => child_parents)
    end
  end
end

# Appends two text fields for a single node to a Solr document: one keyed by
# the accessor's generic name and one keyed by its hierarchical name. Both
# carry node.text as their value.
#
# node             - object responding to #text.
# accessor_pointer - pointer handed to the class-level name helpers.
# solr_doc         - document receiving the fields via <<; a new
#                    Solr::Document is used when omitted.
def solrize_node(node, accessor_pointer, solr_doc = Solr::Document.new)
  generic_key = generate_solr_symbol(self.class.accessor_generic_name(accessor_pointer), :text)
  hierarchical_key = generate_solr_symbol(self.class.accessor_hierarchical_name(accessor_pointer), :text)

  solr_doc << Solr::Field.new(generic_key => node.text)
  solr_doc << Solr::Field.new(hierarchical_key => node.text)
end

end
47 changes: 47 additions & 0 deletions lib/hydra_libs/mods_article.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Datastream class declaring named accessors for the parts of a MODS article
# record. Each +accessor+ call registers a name together with an optional
# :relative_xpath and an optional list of nested :children accessors.
# NOTE(review): +root_property+ and +accessor+ are defined outside this file;
# the "oxns:" prefix is presumably bound to the root namespace declared
# below -- confirm against the DSL implementation.
class ModsArticle < ActiveFedora::NokogiriDatastream

# Declares the root element, its namespace and schema; this must be called so
# that the namespace & schema are set.
root_property :mods, "mods", "http://www.loc.gov/mods/v3", :attributes=>["id", "version"], :schema=>"http://www.loc.gov/standards/mods/v3/mods-3-2.xsd"

# titleInfo elements, with the nested title and the "lang" attribute.
accessor :title_info, :relative_xpath=>'oxns:titleInfo', :children=>[
{:main_title=>{:relative_xpath=>'oxns:title'}},
{:language =>{:relative_xpath=>{:attribute=>"lang"} }}
]
# No :relative_xpath is given here; presumably one is derived from the
# accessor name -- confirm.
accessor :abstract
# topic elements nested inside subject elements.
accessor :topic_tag, :relative_xpath=>'oxns:subject/oxns:topic'
# name elements with type="personal": family/given name parts, affiliation,
# and role terms (text and code forms).
accessor :person, :relative_xpath=>'oxns:name[@type="personal"]', :children=>[
{:last_name=>{:relative_xpath=>'oxns:namePart[@type="family"]'}},
{:first_name=>{:relative_xpath=>'oxns:namePart[@type="given"]'}},
{:institution=>{:relative_xpath=>'oxns:affiliation'}},
{:role=>{:children=>[
{:text=>{:relative_xpath=>'oxns:roleTerm[@type="text"]'}},
{:code=>{:relative_xpath=>'oxns:roleTerm[@type="code"]'}}
]}}
]
# name elements with type="institutional"; only role terms are exposed.
accessor :organization, :relative_xpath=>'oxns:name[@type="institutional"]', :children=>[
{:role=>{:children=>[
{:text=>{:relative_xpath=>'oxns:roleTerm[@type="text"]'}},
{:code=>{:relative_xpath=>'oxns:roleTerm[@type="code"]'}}
]}}
]
# name elements with type="conference"; only role terms are exposed.
accessor :conference, :relative_xpath=>'oxns:name[@type="conference"]', :children=>[
{:role=>{:children=>[
{:text=>{:relative_xpath=>'oxns:roleTerm[@type="text"]'}},
{:code=>{:relative_xpath=>'oxns:roleTerm[@type="code"]'}}
]}}
]
# relatedItem elements with type="host": title, publisher, ISSN, issue date,
# and the details found under the nested part element.
accessor :journal, :relative_xpath=>'oxns:relatedItem[@type="host"]', :children=>[
{:title=>{:relative_xpath=>'oxns:titleInfo/oxns:title'}},
{:publisher=>{:relative_xpath=>'oxns:originInfo/oxns:publisher'}},
{:issn=>{:relative_xpath=>'oxns:identifier[@type="issn"]'}},
{:date_issued=>{:relative_xpath=>'oxns:originInfo/oxns:dateIssued'}},
{:issue => {:relative_xpath=>"oxns:part", :children=>[
{:volume=>{:relative_xpath=>'oxns:detail[@type="volume"]'}},
{:level=>{:relative_xpath=>'oxns:detail[@type="level"]'}},
{:start_page=>{:relative_xpath=>'oxns:extent[@unit="pages"]/oxns:start'}},
{:end_page=>{:relative_xpath=>'oxns:extent[@unit="pages"]/oxns:end'}},
{:publication_date=>{:relative_xpath=>'oxns:date'}}
]}}
]

end
6 changes: 0 additions & 6 deletions lib/hydra_libs/mods_datastream.rb

This file was deleted.

90 changes: 90 additions & 0 deletions spec/fixtures/mods_articles/hydrangea_article1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<mods version="3.0" xsi:schemaLocation="http://www.loc.gov/mods/v3
http://www.loc.gov/standards/mods/v3/mods-3-0.xsd" xmlns="http://www.loc.gov/mods/v3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<titleInfo>
<nonSort>THE</nonSort>
<title>ARTICLE TITLE HYDRANGEA ARTICLE 1</title>
<subTitle>SUBTITLE</subTitle>
</titleInfo>
<titleInfo lang="finnish">
<title>Artikkelin otsikko Hydrangea artiklan 1</title>
</titleInfo>

<name type="personal">
<namePart type="family">FAMILY NAME</namePart>
<namePart type="given">GIVEN NAMES</namePart>
<namePart type="termsOfAddress">DR.</namePart>
<displayForm>NAME AS IT APPEARS</displayForm>
<affiliation>FACULTY, UNIVERSITY</affiliation>
<role>
<roleTerm authority="marcrelator" type="text">creator</roleTerm>
</role>
<role>
<roleTerm type="text">submitter</roleTerm>
</role>
</name>

<name type="personal">
<namePart type="family">Gautama</namePart>
<namePart type="given">Siddartha</namePart>
<namePart type="termsOfAddress">Prince</namePart>
<affiliation>Nirvana</affiliation>
<role>
<roleTerm authority="marcrelator" type="text">teacher</roleTerm>
</role>
</name>

<typeOfResource>text</typeOfResource>
<genre authority="local">journal article</genre>

<abstract>ABSTRACT</abstract>
<subject>
<topic>TOPIC 1</topic>
<topic>TOPIC 2</topic>
</subject>
<subject authority="AUTHORITY SOURCE (RFCD, LCSH)">
<topic>CONTROLLED TERM</topic>
</subject>

<language>
<languageTerm authority="iso639-2b" type="code">en-aus </languageTerm>
</language>

<physicalDescription>
<internetMediaType>application/pdf</internetMediaType>
<extent>36 p.</extent>
</physicalDescription>

<relatedItem type="host">
<titleInfo>
<title>TITLE OF HOST JOURNAL</title>
</titleInfo>
<originInfo>
<publisher>PUBLISHER</publisher>
<dateIssued>DATE</dateIssued>
</originInfo>
<identifier type="issn">0013-8908</identifier>
<part>
<detail type="volume">
<number>2</number>
</detail>
<detail type="level">
<number>2</number>
</detail>
<extent unit="pages">
<start>195</start>
<end>230</end>
</extent>
<date>FEB. 2007</date>
</part>
</relatedItem>

<identifier type="uri">http://URL.edu.au/</identifier>
<identifier type="doi">doi:10.1006/jmbi.1995.0238</identifier>
<location>
<url>http://URL.edu.au/</url>
</location>
<accessCondition type="restrictionOnAccess">EMBARGO NOTE</accessCondition>
<accessCondition type="use and reproduction">OPEN ACCESS</accessCondition>

</mods>
114 changes: 71 additions & 43 deletions spec/unit/nokogiri_datastream_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
require File.join( File.dirname(__FILE__), "../spec_helper" )


describe ActiveFedora::NokogiriDatastream do

before(:all) do
Expand Down Expand Up @@ -117,36 +116,7 @@
doc = Solr::Document.new
@test_ds.to_solr(doc).should equal(doc)
end

it "should iterate through @fields hash" do
@test_ds.expects(:fields).returns(@sample_fields)
solr_doc = @test_ds.to_solr

solr_doc[:publisher_t].should == "publisher1"
solr_doc[:coverage_t].should == "coverage1"
solr_doc[:creation_date_dt].should == "fake-date"
solr_doc[:mydate_dt].should == "fake-date"

solr_doc[:empty_field_t].should be_nil
end

it "should allow multiple values for a single field"

it 'should append create keys in format field_name + _ + field_type' do
@test_ds.stubs(:fields).returns(@sample_fields)

#should have these

@test_ds.to_solr[:publisher_t].should_not be_nil
@test_ds.to_solr[:coverage_t].should_not be_nil
@test_ds.to_solr[:creation_date_dt].should_not be_nil

#should NOT have these
@test_ds.to_solr[:narrator].should be_nil
@test_ds.to_solr[:title].should be_nil
@test_ds.to_solr[:empty_field].should be_nil

end


it "should use Solr mappings to generate field names" do
ActiveFedora::SolrService.load_mappings(File.join(File.dirname(__FILE__), "..", "..", "config", "solr_mappings_af_0.1.yml"))
Expand All @@ -168,26 +138,84 @@
ActiveFedora::SolrService.load_mappings
end

it 'should append _dt to dates' do
@test_ds.expects(:fields).returns(@sample_fields).at_least_once
end

describe ".solrize_accessor" do
before(:all) do
class AccessorizedDs < ActiveFedora::NokogiriDatastream

root_property :mods, "mods", "http://www.loc.gov/mods/v3", :attributes=>["id", "version"], :schema=>"http://www.loc.gov/standards/mods/v3/mods-3-2.xsd"

accessor :title_info, :relative_xpath=>'oxns:titleInfo', :children=>[
{:main_title=>{:relative_xpath=>'oxns:title'}},
{:language =>{:relative_xpath=>{:attribute=>"lang"} }}
]
accessor :abstract
accessor :topic_tag, :relative_xpath=>'oxns:subject/oxns:topic'
accessor :person, :relative_xpath=>'oxns:name[@type="personal"]', :children=>[
{:last_name=>{:relative_xpath=>'oxns:namePart[@type="family"]'}},
{:first_name=>{:relative_xpath=>'oxns:namePart[@type="given"]'}},
{:institution=>{:relative_xpath=>'oxns:affiliation'}},
{:role=>{:children=>[
{:text=>{:relative_xpath=>'oxns:roleTerm[@type="text"]'}},
{:code=>{:relative_xpath=>'oxns:roleTerm[@type="code"]'}}
]}}
]
end
end

before(:each) do
file = fixture(File.join("mods_articles", "hydrangea_article1.xml"))
@accessorized_ds = AccessorizedDs.new(:blob=>file)
end

it "should perform a lookup and iterate over nodes in the result set calling solrize_node then calling solrize_accessor on any of the children, adding accessor_name & node index to parents array" do
mock_title_info_set = ["TI1", "TI2"]
mock_main_title_set = ["main title"]
mock_language_set = ["language"]

#should have these
solr_doc = Solr::Document.new

AccessorizedDs.expects(:accessor_xpath).with( :title_info ).returns("title_info_xpath")
@accessorized_ds.expects(:lookup).with( "title_info_xpath" ).returns(mock_title_info_set)

@test_ds.to_solr[:creation_date_dt].should_not be_nil
@test_ds.to_solr[:mydate_dt].should_not be_nil
mock_title_info_set.each do |tin|
node_index = mock_title_info_set.index(tin)
@accessorized_ds.expects(:solrize_node).with(tin, [:title_info], solr_doc)

# Couldn't mock the recursive calls to solrize_accessor without preventing the initial one, so was forced to mock out the whole recursive stack.
# @accessorized_ds.expects(:solrize_accessor).with(:main_title, AccessorizedDs.accessors[:title_info][:children][:main_title], :parents=>[{:title_info=>node_index}])
# @accessorized_ds.expects(:solrize_accessor).with(:language, AccessorizedDs.accessors[:title_info][:children][:language], :parents=>[{:title_info=>node_index}])
AccessorizedDs.expects(:accessor_xpath).with( {:title_info=>node_index}, :main_title ).returns("title_info_main_title_xpath")
AccessorizedDs.expects(:accessor_xpath).with( {:title_info=>node_index}, :language ).returns("title_info_language_xpath")
@accessorized_ds.expects(:lookup).with( "title_info_main_title_xpath" ).returns(mock_main_title_set)
@accessorized_ds.expects(:lookup).with( "title_info_language_xpath" ).returns(mock_language_set)
@accessorized_ds.expects(:solrize_node).with("main title", [{:title_info=>node_index}, :main_title], solr_doc)
@accessorized_ds.expects(:solrize_node).with("language", [{:title_info=>node_index}, :language], solr_doc)

end

#should NOT have these
@accessorized_ds.solrize_accessor(:title_info, AccessorizedDs.accessors[:title_info], :solr_doc=>solr_doc)

@test_ds.to_solr[:mydate].should be_nil
@test_ds.to_solr[:creation_date_date].should be_nil
end

it "should not call solrize_accessor once it reaches an accessor with no children accessors set" do
pending "not sure how to test for this"
@accessorized_ds.solrize_accessor(:text, AccessorizedDs.accessor_info( [{:person=>1}, :last_name] ), :parents=>[{:person=>1}])
end

it "should use values form parents array when requesting accessor_xpath and when generating solr field names" do
parents_array = [{:person=>0}, {:role=>1}]
AccessorizedDs.accessors[:person][:children][:role][:children][:text]

# This should catch the "submitter" roleTerm from the second role node (index 1) within the first person node (index 0) and put it into a solr field called "person_0_role_1_text_0_t" and a solr field called "person_role_text_t"
@accessorized_ds.solrize_accessor(:text, AccessorizedDs.accessor_info( *parents_array + [:text] ), :parents=>parents_array)

end
end

describe '.fields' do
it "should return a Hash" do
@test_ds.fields.should be_instance_of(Hash)
end
describe ".solrize_node" do
it "should create a solr field containing node.text"
end

end

0 comments on commit 11fd295

Please sign in to comment.