|
| 1 | +#!/usr/bin/python |
| 2 | +import sys |
| 3 | +import csv |
| 4 | +import re |
| 5 | + |
| 6 | +# To run this code on the actual data, please download the additional dataset. |
| 7 | +# You can find instructions in the course materials (wiki) and in instructor comments. |
| 8 | +# There are some things in this data file that are different from what you saw |
| 9 | +# in Lesson 3. This dataset is more complicated, and closer to what you |
| 10 | +# would see in the real world. It was generated by exporting data from |
| 11 | +# an SQL database. |
| 12 | +# Since the data in at least one of the fields (the body field) can include new lines, |
| 13 | +# and all the fields are enclosed in double quotes, |
| 14 | +# you should use a less naive way of processing the data file (instead of split(",")). |
| 15 | +# We have provided sample code on how to use the csv module of Python. |
| 16 | +# "line" in this case will be an array that contains all the fields |
| 17 | +# similar to using split in the previous lesson. |
| 18 | +########################################################################### |
| 19 | +# In this exercise you are interested in the field "body" which is the 5th field. |
| 20 | +# Find forum nodes where "body" contains only one sentence. |
| 21 | +# We define a sentence as a "body" that contains either none of the following
| 22 | +# 3 punctuation marks ".!?", or only one of them, as the last character in the body.
| 23 | +# You should not parse the HTML inside body, or pay attention to new lines. |
| 24 | + |
| 25 | + |
def mapper():
    """Copy to stdout every forum node whose body is a single sentence.

    Reads tab-delimited, double-quoted records from ``sys.stdin`` with the
    ``csv`` module (a body may contain embedded newlines) and writes each
    matching record, unchanged and fully quoted, to ``sys.stdout``.

    A body counts as one sentence when none of the characters ``.``, ``!``
    or ``?`` occurs anywhere except, optionally, as its very last character.
    Records with fewer than five fields have no body and are skipped.
    """
    # Raw string + character class: avoids the invalid "\." / "\?" string
    # escapes and is compiled once instead of on every record.
    terminator = re.compile(r"[.!?]")
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)

    for line in reader:
        if len(line) < 5:
            # Blank or truncated record: there is no body field to inspect.
            continue
        body = line[4]
        # Ignore the final character: a terminator there is still one sentence.
        if terminator.search(body[:-1]):
            continue
        writer.writerow(line)
| 37 | + |
| 38 | + |
# Sample input for main(): six tab-separated records of six double-quoted
# fields each. Only the fifth field (the body) is non-empty, and some bodies
# contain embedded newlines.
test_text = """\"\"\t\"\"\t\"\"\t\"\"\t\"This is one sentence\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Also one sentence!\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Hey!\nTwo sentences!\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"One. Two! Three?\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"One Period. Two Sentences\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Three\nlines, one sentence\n\"\t\"\"
"""
| 46 | + |
| 47 | +# This function allows you to test the mapper with the provided test string |
def main():
    """Run ``mapper`` against the built-in ``test_text`` sample.

    ``sys.stdin`` is temporarily replaced by an in-memory stream holding
    ``test_text`` and is reset to ``sys.__stdin__`` afterwards, even when
    the mapper raises.
    """
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3: the StringIO module no longer exists

    sys.stdin = StringIO(test_text)
    try:
        mapper()
    finally:
        # Always hand the real stdin back, including on a mapper failure.
        sys.stdin = sys.__stdin__
| 53 | + |
# Run the built-in sample through the mapper only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments