This repository has been archived by the owner on Feb 26, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapable_classes.rb
87 lines (74 loc) · 1.57 KB
/
scrapable_classes.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# coding: utf-8
require 'rubygems'
require 'nokogiri'
# require 'rest-client'
require 'httparty'
require 'pdf-reader'
require 'open-uri'
require 'awesome_print'
# Mixin providing the formatting hook and the HTTP PUT call used to push
# scraped records to a REST API.
#
# Including objects are expected to set their own @API_url, @model and @id
# instance variables; #put builds the resource URL from them.
module RestfulApiMethods
  # These assignments run in the module body, so they create instance
  # variables on the RestfulApiMethods module object itself. Objects that
  # include the module do not inherit them.
  @model = ''
  @API_url = ''

  # Formatting hook: returns +info+ unchanged. Includers may override it.
  # Note that this shadows Kernel#format on any object including the module.
  #
  # @param info [Object] the extracted information
  # @return [Object] the very same object
  def format(info)
    info
  end

  # Sends +formatted_info+ with an HTTP PUT to "<@API_url>/<@model>/<@id>".
  # Unset (nil) instance variables contribute an empty path segment.
  #
  # @param formatted_info [Object] passed to HTTParty.put as its second
  #   argument
  # @return [Object] whatever HTTParty.put returns
  def put(formatted_info)
    resource_url = [@API_url, @model, @id].join("/")
    HTTParty.put(resource_url, formatted_info)
  end
end
# Base class for a read -> extract -> format -> save scraping pipeline.
#
# #process walks every location returned by #doc_locations, loads it with
# #read, extracts data with #get_info, formats it with #format (from
# RestfulApiMethods) and hands the result to #save.
#
# NOTE(review): #save is called by #process but is not defined in this class
# or in RestfulApiMethods as visible in this file -- presumably subclasses
# provide it. Likewise @page and @total_pages are only read here; they are
# presumably maintained by subclasses that paginate.
class StorageableInfo
  include RestfulApiMethods

  # @param location [String, Object] a path/URL to read from, or an
  #   already-loaded document (see #read)
  def initialize(location = '')
    # @API_url = 'http://billit.ciudadanointeligente.org'
    @API_url = 'http://billit.congresointeractivo.org'
    @location = location
  end

  # Runs the pipeline over every document location, then calls itself again
  # while @total_pages is greater than @page.
  #
  # NOTE(review): nothing in this class advances @page, so a subclass that
  # sets @total_pages must also advance @page (presumably from
  # #doc_locations or #save) or this recursion never terminates.
  #
  # @param opts [Hash] stored in @options; not otherwise used by this class
  # @return [nil]
  def process(opts = {})
    @options = opts

    # Per-document error logging is currently disabled (the rescue below is
    # commented out), but the log is still opened in append mode so the file
    # exists after a run. The block form guarantees the handle is closed,
    # including on every recursive call and when an exception propagates.
    File.open('scraping_errors.txt', 'a') do |_error_log|
      doc_locations.each do |doc_location|
        # begin
        doc = read doc_location
        puts '#read'
        info = get_info doc
        puts '#got'
        formatted_info = format info
        puts '#formatted'
        save formatted_info
        puts '#saved'
        # rescue StandardError => e
        #   _error_log.puts "EXCEPTION"
        #   _error_log.puts doc_location
        #   _error_log.puts e
        #   puts e
        # end
      end
    end

    # Both sides go through to_i so an unset (nil) @page or @total_pages
    # counts as 0 instead of raising "comparison of Integer with nil failed".
    process(opts) if @total_pages.to_i > @page.to_i
  end

  # Loads the document found at +location+.
  #
  # * Anything that is not a String is treated as an already-loaded document
  #   and returned untouched.
  # * A String containing "pdf" is opened and parsed with PDF::Reader; the
  #   text of all pages is concatenated.
  # * Any other String is opened (URL or local path) and read in full.
  #
  # it would be better if instead we used
  # mimetype = `file -Ib #{path}`.gsub(/\n/,"")
  #
  # @param location [String, Object] defaults to the location given to #new
  # @return [String, Object] the document text, or +location+ itself
  def read(location = @location)
    return location unless location.is_a?(String)

    if location.include?('pdf')
      # Page text is extracted inside the block so the underlying IO is still
      # open while PDF::Reader reads from it.
      open_location(location) { |io| PDF::Reader.new(io).pages.map(&:text).join }
    else
      open_location(location, &:read)
    end
  end

  #----- Undefined Functions -----
  # Hooks with pass-through defaults.

  # @return [Array] the locations #process iterates over; by default just the
  #   location given to #new
  def doc_locations
    [@location]
  end

  # @param doc [Object] the document returned by #read
  # @return [Object] the extracted info; by default +doc+ itself
  def get_info(doc)
    doc
  end

  private

  # Opens +location+, yields the IO to the block and closes it afterwards.
  #
  # Reproduces the dispatch open-uri used to install on Kernel#open: a string
  # of the form "scheme://..." whose parsed URI supports #open (http, https,
  # ftp once open-uri is loaded) is fetched remotely; everything else is
  # opened as a local file. Doing the dispatch explicitly keeps URL fetching
  # working on Ruby >= 3.0, where Kernel#open no longer handles URLs, and
  # avoids Kernel#open's "|command" subprocess feature on scraped strings.
  #
  # @param location [String] URL or local path
  # @return [Object] the value returned by the block
  def open_location(location, &block)
    if location =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      uri = URI.parse(location)
      return uri.open(&block) if uri.respond_to?(:open)
    end

    File.open(location, &block)
  end
end