-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.rb
executable file
·105 lines (100 loc) · 3.78 KB
/
indexer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
# $Id$
require "nkf"
require "yaml"
require "rubygems"
require "libxml"
begin
require 'sqlite3'
DBTYPE = SQLite3::Database
rescue LoadError
require 'dbi'
DBTYPE = DBI
end
require "database.rb"
require "harvester.rb"
module Fuwatto
  # Stateless string clean-up helpers used while indexing metadata.
  class Util
    class << self
      # Runs +str+ through NKF with the "-tXZ1" options and then applies the
      # same whitespace clean-up as space_normalizer.
      def ja_char_normalizer( str )
        converted = NKF.nkf( "-tXZ1", str )
        space_normalizer( converted )
      end

      # Replaces every run of whitespace in +str+ with a single space and
      # removes leading/trailing whitespace. Returns a new string.
      def space_normalizer( str )
        collapsed = str.gsub( /\s+/, " " )
        collapsed.strip
      end
    end
  end
end
# Command-line entry point; runs only when this file is executed directly.
#
# For every site reported by the harvester, each harvested XML file under
# Harvester::HARVEST_DIR/<site>/ is parsed, the junii2 metadata of every
# OAI-PMH record is flattened into one row, and that row is inserted into the
# "md" table. Each inserted record is also logged as "identifier<TAB>title"
# in a per-site, timestamped file under log/.
if $0 == __FILE__
  include Fuwatto

  oai_ns = "oai:http://www.openarchives.org/OAI/2.0/"

  # Metadata wrappers tried in order for each record; the first lookup that
  # yields any node wins.
  metadata_queries = [
    [ "oai:metadata/ju:junii2", "ju:http://ju.nii.ac.jp/junii2" ],
    [ "oai:metadata/irdb:junii2", "irdb:http://irdb.nii.ac.jp/oai" ],
    # ad-hoc workaround for Tulips-R
    [ "oai:metadata/irdb:meta", "irdb:http://irdb.nii.ac.jp/dspace-oai" ]
  ]

  # Element names that are collected; also the column order after "identifier".
  fields = %w[ description title jtitle creator alternative subject NIIsubject publisher contributor type source ]

  # SQLite3 is only defined when the sqlite3 library could be loaded. On the
  # DBI fallback, naming SQLite3::SQLException in a rescue clause would itself
  # raise NameError and hide the real database error, so resolve it up front.
  insert_error = defined?( SQLite3::SQLException ) ? SQLite3::SQLException : StandardError

  harvester = Harvester.new( "harvester.conf" )
  harvester.sites.each do |site|
    p site
    timestamp = Time.now.strftime( "%Y%m%d%H%M%S" )
    # Block form guarantees the per-site log is flushed and closed, even when
    # indexing aborts with an exception.
    File.open( "log/indexer.#{ site }.#{ timestamp }", "w" ) do |logfile|
      Dir.glob( "#{ Harvester::HARVEST_DIR }/#{ site }/*.xml" ) do |file|
        p file
        doc = LibXML::XML::Parser.file( file ).parse
        records = doc.find( "//oai:ListRecords/oai:record", oai_ns )
        records.each do |record|
          identifier = record.find( "oai:header/oai:identifier", oai_ns ).first.content

          junii2 = nil
          metadata_queries.each do |xpath, ns|
            junii2 = record.find( xpath, [ ns, oai_ns ] )
            break unless junii2.empty?
          end
          # Records without any recognised metadata wrapper are skipped.
          next if junii2.empty?

          junii2.each do |md|
            # The "" default is never mutated: the first value for a key is
            # assigned, and only that assigned string is appended to later.
            data = Hash.new( "" )
            data[ "identifier" ] = identifier
            md.children.each do |e|
              next unless fields.include?( e.name )
              str = Util.space_normalizer( e.content )
              if data[ e.name ].empty?
                data[ e.name ] = str
              else
                # Repeated elements are concatenated, separated by a space.
                data[ e.name ] << " " << str
              end
            end

            # NOTE(review): a Database is created for every record and never
            # explicitly released here; whether it can be shared or needs
            # closing depends on database.rb, which is not visible - confirm.
            db = Database.new( DBTYPE )
            indexed_data = ( [ "identifier" ] + fields ).map{|key| data[ key ] }
            db.transaction do
              begin
                # 13 placeholders: a leading 0 followed by the 12 indexed values.
                sth = db.prepare("INSERT INTO md VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
                sth.execute( 0, *indexed_data )
              rescue insert_error
                # Dump the offending row for post-mortem, then let the error propagate.
                File.open( "error.tmp", "w" ) do |io|
                  io.puts indexed_data.inspect
                  io.puts indexed_data.size
                end
                raise
              end
              logfile.puts [ data[ "identifier" ], data[ "title" ] ].join("\t")
            end
          end
        end
      end
    end
  end
end