-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures.py
149 lines (104 loc) · 3.92 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import math
import string
from nltk.corpus import stopwords
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk import stem
def question_headline(headline):
    """Flag headlines that read like a question.

    A headline is flagged when it contains a question mark anywhere, or when
    one of the interrogative/conditional keywords appears as a whole word.
    Matching is case-insensitive.

    Args:
        headline: Raw headline text; may be None or empty.

    Returns:
        1 if the headline looks like a question, otherwise 0.
    """
    if not headline:
        # A NULL or empty headline cannot be a question.
        return 0
    if '?' in headline:
        return 1
    keywords = {
        'should', 'can', 'if',
        'is', 'would', 'why', 'how',
        'when', 'where',
    }
    # Compare whole lower-cased words (punctuation treated as a separator)
    # so that "This" does not match 'is', "Show" does not match 'how', and
    # a capitalised "Should" still matches 'should'.
    normalized = "".join(
        " " if ch in string.punctuation else ch for ch in headline.lower()
    )
    tokens = set(normalized.split())
    return 1 if keywords & tokens else 0
def religion_headline(headline):
    """Flag headlines that mention a religion-related keyword.

    Keywords are matched as case-insensitive substrings, so word stems such
    as "atheis" or "christ" also match "atheist", "Christian", etc.

    Args:
        headline: Raw headline text; may be None or empty.

    Returns:
        1 if any keyword occurs in the headline, otherwise 0.
    """
    if not headline:
        # A NULL or empty headline contains no keywords.
        return 0
    keywords = [
        "secular", "humanist", "humanism", "god",
        "religion", "atheis", "islam", "church",
        "jesus", "christ", "catholic", "pope",
        "imam", "isis", "muslim", "gay", "marriage",
        "israel", "jew", "extremist", "fundamentalism",
        "terror", "hindu",
    ]
    # The keywords are lower case, so the headline must be lower-cased too;
    # otherwise capitalised words such as "Pope" or "Islam" are missed.
    # NOTE(review): substring matching also means "isis" matches "crisis".
    lowered = headline.lower()
    return 1 if any(word in lowered for word in keywords) else 0
def word_log(wordcount):
    """Return the natural log of ``1 + wordcount``.

    Args:
        wordcount: A number, or anything ``float()`` accepts (e.g. a numeric
            string).

    Returns:
        ``math.log1p`` of the word count, as a float.
    """
    count = float(wordcount)
    return math.log1p(count)
def political_headline(headline):
    """Flag headlines that mention a politics-related keyword.

    Keywords are matched as case-insensitive substrings.

    Args:
        headline: Raw headline text; may be None or empty.

    Returns:
        1 if any keyword occurs in the headline, otherwise 0.
    """
    if not headline:
        # A NULL or empty headline contains no keywords.
        return 0
    keywords = [
        'obama', 'clinton', 'trump', 'election',
        'president', 'senate', 'supreme court',
    ]
    # The keywords are lower case; without lower-casing the headline,
    # capitalised names such as "Obama" or "Trump" would never match.
    lowered = headline.lower()
    return 1 if any(word in lowered for word in keywords) else 0
def socialmedia_headline(headline):
    """Flag headlines that mention a social-media keyword.

    Keywords are matched as case-insensitive substrings.

    Args:
        headline: Raw headline text; may be None or empty.

    Returns:
        1 if any keyword occurs in the headline, otherwise 0.
    """
    if not headline:
        # A NULL or empty headline contains no keywords.
        return 0
    keywords = [
        'facebook', 'instagram', 'snapchat',
        'twitter', 'tweet', 'hashtag',
    ]
    # The keywords are lower case; lower-case the headline so brand names
    # written as "Facebook" or "Twitter" are detected.
    lowered = headline.lower()
    return 1 if any(word in lowered for word in keywords) else 0
def dayofweek(date):
    """Return the day of the week for a date or timestamp.

    Args:
        date: A ``datetime.date`` or ``datetime.datetime`` value, or None.

    Returns:
        An int from 0 (Monday) to 6 (Sunday), or None when ``date`` is None.
    """
    if date is None:
        # Propagate a missing date as a missing value instead of crashing.
        return None
    # ``date.day`` is the day of the month (1-31), not the day of the week;
    # ``weekday()`` gives the value this function's name promises.
    return date.weekday()
def preprocess_headline(headline):
    """Normalise a headline for text modelling.

    The headline is lower-cased, ASCII punctuation characters are removed,
    English stop words are dropped, and each remaining word is stemmed with
    the Snowball English stemmer.

    Args:
        headline: Raw headline text, or None.

    Returns:
        The processed words joined by single spaces, or None when
        ``headline`` is None.
    """
    if headline is None:
        # Propagate a missing headline as a missing value instead of crashing.
        return None
    stemmer = stem.snowball.EnglishStemmer()
    # Load the stop-word list once per call and keep it in a set; calling
    # stopwords.words() inside the comprehension re-evaluated it for every
    # word and then scanned the resulting list linearly.
    stop_words = set(stopwords.words("english"))
    lowered = headline.lower()
    stripped = "".join(ch for ch in lowered if ch not in string.punctuation)
    return ' '.join(
        stemmer.stem(word) for word in stripped.split() if word not in stop_words
    )
def registerFunctions(sqlContext):
    """Register this module's feature functions as SQL UDFs.

    Each function is registered on ``sqlContext.udf`` under its own Python
    name, together with the Spark SQL type of its return value.

    Args:
        sqlContext: Object exposing ``udf.register(name, func, returnType)``.
    """
    udf_specs = (
        ("question_headline", question_headline, IntegerType()),
        ("religion_headline", religion_headline, IntegerType()),
        ("word_log", word_log, FloatType()),
        ("political_headline", political_headline, IntegerType()),
        ("socialmedia_headline", socialmedia_headline, IntegerType()),
        ("dayofweek", dayofweek, IntegerType()),
        ("preprocess_headline", preprocess_headline, StringType()),
    )
    for sql_name, func, return_type in udf_specs:
        sqlContext.udf.register(sql_name, func, return_type)
def featurecreation(df, tblname, sqlContext):
    """Build the engineered-feature table for ``df`` with a SQL query.

    ``df`` is registered as the temporary table ``tblname`` and queried
    through ``sqlContext``. When ``df`` has a ``popular`` column, the query
    selects it as a double-valued ``label`` alongside the derived features;
    otherwise it selects the casted raw columns plus the derived features.

    The queries call UDFs by name (``preprocess_headline``,
    ``question_headline``, ...); presumably ``registerFunctions`` must already
    have been called on the same ``sqlContext`` -- confirm with callers.

    Args:
        df: DataFrame-like object with ``registerTempTable`` and ``columns``.
        tblname: Name for the temporary table; interpolated into the SQL text.
        sqlContext: Object exposing ``sql(query)``.

    Returns:
        Whatever ``sqlContext.sql`` returns for the selected query.
    """
    df.registerTempTable(tblname)

    if 'popular' in df.columns:
        # Labelled data: expose ``popular`` as the label column.
        query = """SELECT
uniqueid,
preprocess_headline(headline) as process_headline,
CAST(popular as double) as label,
CASE WHEN newsdesk='' THEN 'NA' ELSE newsdesk END newsdesk,
CASE WHEN sectionname='' THEN 'NA' ELSE sectionname END sectionname,
CASE WHEN subsectionname='' THEN 'NA' ELSE subsectionname END subsectionname,
question_headline(headline) as question,
religion_headline(headline) as religion,
political_headline(headline) as political,
socialmedia_headline(headline) as socialmedia,
word_log(wordcount) as wordcount_norm,
dayofweek(pubdate) as dowpub,
hour(pubdate) as hourpub
FROM {0}"""
    else:
        # Unlabelled data: keep the raw text columns and cast ids/counts/dates.
        query = """SELECT
preprocess_headline(headline) as process_headline,
CAST(uniqueid as int) as uniqueid,
CAST(wordcount as int) as wordcount,
headline,
snippet,
abstract,
CAST(pubdate as timestamp) as pubdate,
CASE WHEN newsdesk='' THEN 'NA' ELSE newsdesk END newsdesk,
CASE WHEN sectionname='' THEN 'NA' ELSE sectionname END sectionname,
CASE WHEN subsectionname='' THEN 'NA' ELSE subsectionname END subsectionname,
question_headline(headline) as question,
religion_headline(headline) as religion,
political_headline(headline) as political,
socialmedia_headline(headline) as socialmedia,
word_log(wordcount) as wordcount_norm,
dayofweek(CAST (pubdate as timestamp)) as dowpub,
hour(pubdate) as hourpub
FROM {0} """

    return sqlContext.sql(query.format(tblname))