-
Notifications
You must be signed in to change notification settings - Fork 1
/
scratch.jl
executable file
·42 lines (31 loc) · 1009 Bytes
/
scratch.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Packages used by this script: tabular data, CSV parsing, and pooled columns.
using DataFrames
using CSV
using PooledArrays
import Pkg
# FastLink is added directly from its GitHub repository. This call is unconditional,
# so it executes every time the script is run, before `using FastLink` loads it.
Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
using FastLink
# JSON is used to parse the linkage parameter file.
using JSON
# Resolve the directories of the "dfA" and "dfB" artifacts.
a_fil = Pkg.Artifacts.@artifact_str("dfA")
b_fil = Pkg.Artifacts.@artifact_str("dfB")

# Read both tables with identical parser options: a single parsing task, pooled
# columns, and both the empty string and "NA" treated as `missing`.
dfA, dfB = map(("$(a_fil)/dfA.csv", "$(b_fil)/dfB.csv")) do path
    CSV.read(path, DataFrame; ntasks=1, pool=true, missingstring=["", "NA"])
end
# Linkage parameters are parsed from a JSON file given by a relative path.
config = JSON.parsefile("test_parameters.json")

# Add an identifier column to each table holding the hash of every row
# (`id` for dfA, `id2` for dfB).
dfA[!, :id] = map(hash, eachrow(dfA))
dfB[!, :id2] = map(hash, eachrow(dfB))
# Columns that are normalized before linkage.
varnames = ["firstname", "middlename", "lastname", "housenum"]

# Normalize each listed column in both tables:
#   * string columns are upper-cased (missing values pass through untouched) and
#     stored as PooledArrays;
#   * every other column is materialized as a plain Vector.
# The string/non-string decision is taken from dfA's column and applied to both tables.
for var in varnames
    # Strip `Missing` from the element type before testing it. A column read with
    # missing values has eltype `Union{Missing, String}`, which is not a subtype of
    # `AbstractString`, so a direct `eltype(...) <: AbstractString` test would skip
    # upper-casing for exactly the columns that contain missings.
    valuetype = nonmissingtype(eltype(dfA[!, var]))
    # An all-missing column yields `Union{}`, which is a subtype of every type;
    # exclude it explicitly so such a column keeps taking the non-string branch.
    if valuetype !== Union{} && valuetype <: AbstractString
        dfA[!, var] = PooledArray(passmissing(uppercase).(dfA[!, var]))
        dfB[!, var] = PooledArray(passmissing(uppercase).(dfB[!, var]))
    else
        dfA[!, var] = Vector(dfA[!, var])
        dfB[!, var] = Vector(dfB[!, var])
    end
end

# Run the linkage on the normalized tables with the parsed configuration.
result = fastLink(dfA, dfB, config)