---
# application main params
general:
    # where all paths and following files are located
    basedirectory: .
    locale: en_US.UTF-8
    # the following directories or files are under "basedirectory"
    index: index
    dbenv: db
    # log options, file is "pytextminer-log.txt"
    logsize: 1024000
    loglevel: debug
    user: /Applications/Tinasoft/sessions
    shared: /Applications/Tinasoft/shared
    whitelist_directory: /Applications/Tinasoft/whitelists
    source_file_directory: /Applications/Tinasoft/source_files
    userstopwords: /Applications/Tinasoft/shared/user_stopwords.csv
    userwhitelist: /Applications/Tinasoft/shared/user_whitelist.csv
    # the stopwords file is under "shared"
    stopwords: stopwords/en.txt
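    # for example, with basedirectory set to "." the relative entries "index" and "dbenv"
    # presumably resolve to ./index and ./db, while the /Applications/Tinasoft/... entries
    # are absolute paths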
# extraction settings
datasets:
    doc_extraction:
        # the following values are document object fields:
        # - either defined by one of the "fields" declarations below
        # - or one of the constant required fields: 'content' / 'label' / 'id'
        - title
        - content
        #- keywords
    # tina csv columns declaration
    # undeclared fields are ignored,
    # and a warning is raised for optional fields that are not found
    tinacsv:
        # doc_label is the key of one of the fields below
        # if not found in the file, the field specified by "label" is used instead
        doc_label: acronym
        fields:
            # required fields
            label: doc_id
            content: abstract
            corpus_id: corp_id
            id: doc_id
            # optional fields
            title: title
            acronym: acronym
            keywords: keywords
        # csv reader params
        encoding: utf_8
        #
        # CSV dialects.
        #
        # Possible choices:
        # auto, excel, excel-tab, excel-comma, excel-semicolon
        #
        # auto: tries to detect the dialect when importing a csv file
        # excel: Microsoft Excel uses the current LOCALE to choose either "," or ";"
        # excel-tab: tab-separated
        # excel-comma: comma-separated
        # excel-semicolon: semicolon-separated
        #
        # Guidelines:
        #
        # dialect_read: used when importing a CSV file. You may want to leave it on "auto",
        # unless your spreadsheet produced a broken file and indexing fails
        #
        # dialect_write: used when writing a CSV file. If "auto" does not work, you HAVE to edit this.
        dialect_read: auto
        dialect_write: auto
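        # for illustration, a hypothetical source CSV matching the field declaration
        # above (column names follow the "fields" mapping, values are invented):
        #
        #   doc_id,corp_id,abstract,title,acronym,keywords
        #   D001,2009,"A short abstract ...","An example title",ACRO,"networks,complex systems"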
    # pubmed.gov "medline" file export
    medline:
        # doc_label is the key of one of the fields below
        # if not found in the file, the field specified by "label" is used instead
        doc_label: title
        fields:
            label: PMID
            content: AB
            corpus_id: DP
            id: PMID
            # optional fields
            title: TI
        # period_size = number of first characters of the pub date field "corpusField"
        period_size: 4
        encoding: ascii
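        # for illustration, a shortened hypothetical record in the Medline tag format
        # expected by the mapping above (values are invented):
        #
        #   PMID- 12345678
        #   DP  - 2009 Jan
        #   TI  - An example article title
        #   AB  - An example abstract.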
    # archive of pubmed.gov "medline" files, organized like this:
    #   Medline/
    #     period1/period1.txt
    #     period2/period2.txt
    #medlinearchive:
    #   TODO
    # extraction size
    ngramMin: 1
    ngramMax: 4
    # the tagger is trained on 2 x training_tagger_size sentences of a tagged nltk corpus
    # to train on the whole corpus, put ~ instead of a number of sentences
    training_tagger_size: ~
    # under the "basedirectory" directory; delete this file to regenerate a new one on the next start
    tagger: /Applications/Tinasoft/shared/tagger.pickle
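    # for example, a hypothetical value of training_tagger_size: 10000 would train the
    # tagger on 2 x 10000 = 20000 tagged sentences instead of the whole corpus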
    # change this regular expression to change NGram extraction filtering
    postag_valid: '^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?)+?)*?$'
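    # rough reading of the expression above: candidate n-grams are matched as
    # comma-terminated POS tag sequences, so e.g. "JJ,NN," (adjective + noun) passes
    # the filter, while a verb-initial sequence such as "VB,NN," does not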
datamining:
    template: /Applications/Tinasoft/shared/gexf/gexf.default.template
    # default values if no params are passed to the gexf exporter
    DocumentGraph:
        edgethreshold:
            - 0.0001
            - inf
        nodethreshold:
            - 1.00
            - inf
        #proximity: sharedNGrams
        proximity: logJaccard
        maxdegree: 100
    NGramGraph:
        edgethreshold:
            - 0.0001
            - inf
        nodethreshold:
            - 1.00
            - inf
        alpha: 0.10
        #hapax: 1
        proximity: Cooccurrences
        #proximity: EquivalenceIndex
        #proximity: PseudoInclusion
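        # note: each edgethreshold/nodethreshold pair above presumably acts as a
        # [min, max] range; in YAML the same lists may equivalently be written in
        # flow style, e.g.:
        #   edgethreshold: [ 0.0001, inf ]
        #   nodethreshold: [ 1.00, inf ]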
indexer:
    minCooc: 10