Skip to content

Commit 9b561e8

Browse files
committedApr 12, 2014
Solution to the first problem in Lesson 4: Mapreduce Design Patterns - Filtering Patterns
1 parent 046f12d commit 9b561e8

File tree

3 files changed

+98
-0
lines changed

3 files changed

+98
-0
lines changed
 

‎Datasets/forum_data.tar.gz

37.8 MB
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/python
2+
import sys
3+
import csv
4+
import re
5+
6+
# To run this code on the actual data, please download the additional dataset.
7+
# You can find instructions in the course materials (wiki) and in instructor comments.
8+
# There are some things in this data file that are different from what you saw
9+
# in Lesson 3. This dataset is more complicated, and closer to what you
10+
# would see in the real world. It was generated by exporting data from
11+
# an SQL database.
12+
# Since the data in at least one of the fields (the body field) can include new lines,
13+
# and all the fields are enclosed in double quotes,
14+
# you should use a less naive way of processing the data file (instead of split(",")).
15+
# We have provided sample code on how to use the csv module of Python.
16+
# "line" in this case will be an array that contains all the fields
17+
# similar to using split in the previous lesson.
18+
###########################################################################
19+
# In this exercise you are interested in the field "body" which is the 5th field.
20+
# Find forum nodes where "body" contains only one sentence.
21+
# We define a sentence as a "body" that contains either none of the following
22+
# 3 punctuation marks (".", "!", "?"), or exactly one of them as the last character in the body.
23+
# You should not parse the HTML inside body, or pay attention to new lines.
24+
25+
26+
def mapper():
    """Emit only the forum nodes whose body is a single sentence.

    Reads tab-delimited, double-quoted records from ``sys.stdin`` with the
    ``csv`` module (a body may contain embedded new lines) and writes every
    kept record, unchanged, to ``sys.stdout`` with all fields quoted.

    A body (the 5th field) counts as one sentence when none of the
    characters ``.``, ``!`` or ``?`` occurs before its last character; the
    last character itself is never inspected.  Records with fewer than five
    fields (for example a blank line) have no body to test and are skipped
    instead of raising ``IndexError``.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)

    # Raw string and a character class: no invalid "\." / "\?" escapes, and
    # the pattern is compiled once rather than looked up for every record.
    sentence_end = re.compile(r"[.?!]")

    for line in reader:
        if len(line) < 5:
            continue
        body = line[4]
        # Only the part before the final character may not contain a
        # sentence terminator; search() stops at the first hit.
        if sentence_end.search(body[:-1]) is None:
            writer.writerow(line)
37+
38+
39+
# Sample input for main(): six tab-separated, fully quoted records in which
# only the 5th field (the body) is filled in.  Two bodies contain embedded
# new lines, so the records cannot be split naively on line breaks.
test_text = "".join(
    '""\t""\t""\t""\t"' + body + '"\t""\n'
    for body in (
        "This is one sentence",
        "Also one sentence!",
        "Hey!\nTwo sentences!",
        "One. Two! Three?",
        "One Period. Two Sentences",
        "Three\nlines, one sentence\n",
    )
)
46+
47+
# This function allows you to test the mapper with the provided test string
48+
def main():
    """Run mapper() against the bundled test_text sample.

    ``sys.stdin`` is temporarily replaced by an in-memory stream holding
    ``test_text``.  The stream that was active before the call is put back
    afterwards, even when ``mapper()`` raises.
    """
    try:
        from StringIO import StringIO  # Python 2: accepts plain str
    except ImportError:
        from io import StringIO  # Python 3

    saved_stdin = sys.stdin
    sys.stdin = StringIO(test_text)
    try:
        mapper()
    finally:
        # Restore whatever was installed before, not unconditionally
        # sys.__stdin__, so an outer redirection is left intact.
        sys.stdin = saved_stdin


if __name__ == "__main__":
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/python
2+
"""
3+
Your mapper function should print out the 10 lines containing the longest posts, sorted in
4+
ascending order from shortest to longest.
5+
Please do not use global variables and do not change the "main" function.
6+
"""
7+
import sys
8+
import csv
9+
10+
11+
def mapper():
    """Print the 10 records with the longest body, shortest first.

    Reads tab-delimited, double-quoted records from ``sys.stdin`` and writes
    at most ten of them to ``sys.stdout`` with all fields quoted, ordered by
    ascending length of the body (the 5th field).  Records with fewer than
    five fields have no body to measure and are ignored.

    Only the current ten candidates are kept in memory, so the input is
    streamed rather than loaded as a whole, and no global state is used.
    """
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)

    limit = 10
    longest = []

    def body_length(record):
        return len(record[4])

    for line in reader:
        if len(line) < 5:
            continue
        longest.append(line)
        if len(longest) > limit:
            # At most limit + 1 entries: sort them and drop the shortest.
            longest.sort(key=body_length)
            del longest[0]

    # Final ordering from shortest to longest body.
    longest.sort(key=body_length)
    for record in longest:
        writer.writerow(record)
20+
21+
22+
23+
# Sample input for main(): eleven tab-separated, fully quoted records in
# which only the 5th field (the body) is filled in, with body lengths from
# 1 to 11 characters in shuffled order.
test_text = "".join(
    '""\t""\t""\t""\t"' + body + '"\t""\n'
    for body in (
        "333",
        "88888888",
        "1",
        "11111111111",
        "1000000000",
        "22",
        "4444",
        "666666",
        "55555",
        "999999999",
        "7777777",
    )
)
35+
36+
# This function allows you to test the mapper with the provided test string
37+
def main():
    """Feed the bundled test_text sample to mapper() through sys.stdin."""
    # Python 2 module providing an in-memory, file-like text stream.
    import StringIO as stringio_module

    sample_stream = stringio_module.StringIO(test_text)
    sys.stdin = sample_stream
    mapper()
    # Hand the interpreter's original standard input back afterwards.
    sys.stdin = sys.__stdin__
42+
43+
# Run the self-test unconditionally; there is no __main__ guard on this call,
# so it also executes when the module is imported.
main()

0 commit comments

Comments
 (0)
Please sign in to comment.