-
Notifications
You must be signed in to change notification settings - Fork 15
/
Rakefile
147 lines (145 loc) · 5.83 KB
/
Rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
rule "name-database/firstnames.csv" => "name-database/0717-182/nam_dict.txt" do
File.open("name-database/firstnames.csv", 'w') do |out|
out.write <<-HEAD
# List of first names, genders and country-specific frequencies
# This file is subject to the GNU Free Documentation License.
# Permission is granted to copy, distribute and/or modify
# this document under the terms of the GNU Free Documentation
# License, Version 1.2 or any later version published by the
# Free Software Foundation; with no Invariant Sections,
# no Front-Cover Texts, and no Back-Cover Texts.
#
# Copyright (c):
# 2007-2008: Jörg MICHAEL, Adalbert-Stifter-Str. 11,
# 30655 Hannover, Germany
# Updated 2016:
# Matthias Winkelmann, <[email protected]>
# https://matthi.coffee
# https://twitter.com/whereismatthi
# Github: https://github.com/MatthiasWinkelmann/name-database
#
# Original work at http://www.heise.de/ct/ftp/07/17/182/
#
#
# This file is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# A copy of the license can be found in the file GNU_DOC.TXT.
# You should have received a copy of the GNU Free Documentation
# License along with this file;
# if not, write to the Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Gender is encoded in column 2 as:
#
# F female
# 1F female if first part of name, otherwise mostly male
# ?F mostly female
# M male
# 1M male if first part of name, otherwise mostly female
# ?M mostly male
# ? unisex
#
# Frequencies are encoded in (2^x)% of the population,
# i. e. a "-2" in column 31 indicates a frequency between 2^-4 = 0.0625% and 2^-3 = 0.125% of the population of Great Britain
name;gender;frequencies
;;Great Britain;Ireland;U.S.A.;Italy;Malta;Portugal;Spain;France;Belgium;Luxembourg;the Netherlands;East Frisia;Germany;Austria;Swiss;Iceland;Denmark;Norway;Sweden;Finland;Estonia;Latvia;Lithuania;Poland;Czech Republic;Slovakia;Hungary;Romania;Bulgaria;Bosnia and Herzegovina;Croatia;Kosovo;Macedonia;Montenegro;Serbia;Slovenia;Albania;Greece;Russia;Belarus;Moldova;Ukraine;Armenia;Azerbaijan;Georgia;Kazakhstan/Uzbekistan,etc.;Turkey;Arabia/Persia;Israel;China;India/Sri Lanka;Japan;Korea;Vietnam;other countries
HEAD
characters = {'<A/>' => ([256].pack("U*")),
'<a/>' => ([257].pack("U*")),
'<Â>' => ([258].pack("U*")),
'<â>' => ([259].pack("U*")),
'<A,>' => ([260].pack("U*")),
'<a,>' => ([261].pack("U*")),
'<C´>' => ([262].pack("U*")),
'<c´>' => ([263].pack("U*")),
'<C^>' => ([268].pack("U*")),
'<CH>' => ([268].pack("U*")),
'<c^>' => ([269].pack("U*")),
'<c^>' => ([269].pack("U*")),
'<d´>' => ([271].pack("U*")),
'<DJ>' => ([272].pack("U*")),
'<Ð>' => ([272].pack("U*")),
'<ð>' => ([273].pack("U*")),
'<dj>' => ([273].pack("U*")),
'<E/>' => ([274].pack("U*")),
'<e/>' => ([275].pack("U*")),
'<E°>' => ([278].pack("U*")),
'<e°>' => ([279].pack("U*")),
'<E,>' => ([280].pack("U*")),
'<e,>' => ([281].pack("U*")),
'<Ê>' => ([282].pack("U*")),
'<ê>' => ([283].pack("U*")),
'<G^>' => ([286].pack("U*")),
'<g^>' => ([287].pack("U*")),
'<G,>' => ([290].pack("U*")),
'<g´>' => ([291].pack("U*")),
'<I/>' => ([298].pack("U*")),
'<i/>' => ([299].pack("U*")),
'<I°>' => ([304].pack("U*")),
'<i>' => ([305].pack("U*")),
'<IJ>' => ([306].pack("U*")),
'<ij>' => ([307].pack("U*")),
'<K,>' => ([310].pack("U*")),
'<k,>' => ([311].pack("U*")),
'<L,>' => ([315].pack("U*")),
'<l,>' => ([316].pack("U*")),
'<L´>' => ([317].pack("U*")),
'<l´>' => ([318].pack("U*")),
'<L/>' => ([321].pack("U*")),
'<l/>' => ([322].pack("U*")),
'<N,>' => ([325].pack("U*")),
'<n,>' => ([326].pack("U*")),
'<N^>' => ([327].pack("U*")),
'<n^>' => ([328].pack("U*")),
'<Ö>' => ([336].pack("U*")),
'<ö>' => ([337].pack("U*")),
'Œ' => ([338].pack("U*")),
'<OE>' => ([338].pack("U*")),
'œ' => ([339].pack("U*")),
'<oe>' => ([339].pack("U*")),
'<R^>' => ([344].pack("U*")),
'<r^>' => ([345].pack("U*")),
'<S,>' => ([350].pack("U*")),
'<s,>' => ([351].pack("U*")),
'Š' => ([352].pack("U*")),
'<S^>' => ([352].pack("U*")),
'<SH>' => ([352].pack("U*")),
'<SCH>' => ([352].pack("U*")),
'<sh>' => ([353].pack("U*")),
'š' => ([353].pack("U*")),
'<s^>' => ([353].pack("U*")),
'<sch>' => ([353].pack("U*")),
'<T,>' => ([354].pack("U*")),
'<t,>' => ([355].pack("U*")),
'<t´>' => ([357].pack("U*")),
'<U/>' => ([362].pack("U*")),
'<u/>' => ([363].pack("U*")),
'<U°>' => ([366].pack("U*")),
'<u°>' => ([367].pack("U*")),
'<U,>' => ([370].pack("U*")),
'<u,>' => ([371].pack("U*")),
'<Z°>' => ([379].pack("U*")),
'<z°>' => ([380].pack("U*")),
'<Z^>' => ([381].pack("U*")),
'<z^>' => ([382].pack("U*")),
'<ß>' => ([3783].pack("U*"))}
frequencies = {'1' => -8, '2' => -7, '3' => -6, '4' => -5, '5' => -4, '6' => -3, '7' => -2, '8' => -1, '9' => 0, 'A' => 1, 'B' => 2, 'C' => 4, 'D' => 5}
File.open("name-database/0717-182/nam_dict.txt", 'r:iso-8859-1:utf-8').each do |line|
puts line
next if line[0] == '#'
next if line[0] == '='
next if line[29] == '+'
gender = line.split(' ').first
name = line.split(' ')[1].gsub(/<.+>/, characters)
countries = line[30,55].split('').map { |freq|
freq.empty? ? '' : frequencies[freq]
}
.join(';')
newline = [name, gender, countries].join(';') + "\n"
out.write(newline)
puts newline
end
end
end