Skip to content

Commit

Permalink
Merge pull request #26 from geun9716/master
Browse files Browse the repository at this point in the history
날짜 지정 기능 추가 (Add Date Assignment feature)
  • Loading branch information
lumyjuwon authored Mar 27, 2022
2 parents b67bf84 + 5553782 commit 6a9ba0d
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 35 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 lumyjuwon
Copyright (c) 2022 lumyjuwon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
Binary file added dist/KoreaNewsCrawler-1.51-py3-none-any.whl
Binary file not shown.
94 changes: 70 additions & 24 deletions korea_news_crawler/articlecrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self):
self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, '세계': 104, 'IT과학': 105, '오피니언': 110,
'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105, 'opinion': 110}
self.selected_categories = []
self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0}
self.date = {'start_year': 0, 'start_month': 0, 'start_day' : 0, 'end_year': 0, 'end_month': 0, 'end_day':0}
self.user_operating_system = str(platform.system())

def set_category(self, *args):
Expand All @@ -27,47 +27,94 @@ def set_category(self, *args):
raise InvalidCategory(key)
self.selected_categories = args

def set_date_range(self, start_year, start_month, end_year, end_month):
args = [start_year, start_month, end_year, end_month]
def set_date_range(self, start_date: str, end_date: str):
    """Set the crawling period from dash-separated date strings.

    Accepted formats for both arguments: 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'.
    Fields omitted from the start date default to the earliest value
    (January 1st); fields omitted from the end date default to the latest
    (December / last day of that month, leap years included).

    Args:
        start_date: inclusive start of the range, e.g. '2018' or '2018-01-05'.
        end_date: inclusive end of the range, e.g. '2018-02'.

    Raises:
        ValueError: if a date string is not in one of the accepted formats
            (non-numeric parts also raise ValueError via int()).
        InvalidYear, InvalidMonth, InvalidDay, OverbalanceMonth,
        OverbalanceDay: if the parsed values are out of range or reversed.
    """
    start = list(map(int, start_date.split("-")))
    end = list(map(int, end_date.split("-")))

    # Fix: the original code had no branch for other lengths, so input like
    # '2018-01-02-03' crashed with UnboundLocalError instead of a clear error.
    if not 1 <= len(start) <= 3:
        raise ValueError(f"start_date must be 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD', got {start_date!r}")
    if not 1 <= len(end) <= 3:
        raise ValueError(f"end_date must be 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD', got {end_date!r}")

    # Pad the start date with the earliest month/day for any omitted field.
    start_year, start_month, start_day = (start + [1, 1])[:3]

    # End defaults: omitted month -> December; omitted day resolved below,
    # after the month has been validated (the original called
    # calendar.monthrange before validating, so a bad month surfaced as
    # calendar.IllegalMonthError instead of the project's InvalidMonth).
    end_year = end[0]
    end_month = end[1] if len(end) >= 2 else 12
    end_day = end[2] if len(end) == 3 else None

    if start_year > end_year:
        raise InvalidYear(start_year, end_year)
    if not 1 <= start_month <= 12:
        raise InvalidMonth(start_month)
    if not 1 <= end_month <= 12:
        raise InvalidMonth(end_month)

    if end_day is None:
        # Last calendar day of the end month (handles 28/29/30/31 correctly).
        end_day = calendar.monthrange(end_year, end_month)[1]

    if not 1 <= start_day <= calendar.monthrange(start_year, start_month)[1]:
        raise InvalidDay(start_day)
    if not 1 <= end_day <= calendar.monthrange(end_year, end_month)[1]:
        raise InvalidDay(end_day)
    if start_year == end_year and start_month > end_month:
        raise OverbalanceMonth(start_month, end_month)
    if start_year == end_year and start_month == end_month and start_day > end_day:
        raise OverbalanceDay(start_day, end_day)

    # Store the validated boundaries; keys match the dict created in __init__.
    # (The original's debug print(self.date) has been removed.)
    self.date = {'start_year': start_year, 'start_month': start_month, 'start_day': start_day,
                 'end_year': end_year, 'end_month': end_month, 'end_day': end_day}

@staticmethod
def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
def make_news_page_url(category_url, date):
made_urls = []
for year in range(start_year, end_year + 1):
target_start_month = start_month
target_end_month = end_month

if start_year != end_year:
if year == start_year:
target_start_month = start_month
for year in range(date['start_year'], date['end_year'] + 1):
if date['start_year'] == date['end_year']:
target_start_month = date['start_month']
target_end_month = date['end_month']
else:
if year == date['start_year']:
target_start_month = date['start_month']
target_end_month = 12
elif year == end_year:
elif year == date['end_year']:
target_start_month = 1
target_end_month = end_month
target_end_month = date['end_month']
else:
target_start_month = 1
target_end_month = 12

for month in range(target_start_month, target_end_month + 1):
for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
if date['start_month'] == date['end_month']:
target_start_day = date['start_day']
target_end_day = date['end_day']
else:
if year == date['start_year'] and month == date['start_month']:
target_start_day = date['start_day']
target_end_day = calendar.monthrange(year, month)[1]
elif year == date['end_year'] and month == date['end_month']:
target_start_day = 1
target_end_day = date['end_day']
else:
target_start_day = 1
target_end_day = calendar.monthrange(year, month)[1]

for day in range(target_start_day, target_end_day + 1):
if len(str(month)) == 1:
month = "0" + str(month)
if len(str(month_day)) == 1:
month_day = "0" + str(month_day)
if len(str(day)) == 1:
day = "0" + str(day)

# 날짜별로 Page Url 생성
url = category_url + str(year) + str(month) + str(month_day)
url = category_url + str(year) + str(month) + str(day)

# totalpage는 네이버 페이지 구조를 이용해서 page=10000으로 지정해 totalpage를 알아냄
# page=10000을 입력할 경우 페이지가 존재하지 않기 때문에 page=totalpage로 이동 됨 (Redirect)
Expand All @@ -94,12 +141,11 @@ def crawling(self, category_name):
writer = Writer(category='Article', article_category=category_name, date=self.date)
# 기사 url 형식
url_format = f'http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1={self.categories.get(category_name)}&date='
# start_year년 start_month월 ~ end_year의 end_month 날짜까지 기사를 수집합니다.
target_urls = self.make_news_page_url(url_format, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])

print(category_name + " Urls are generated")
print("The crawler starts")
# start_year년 start_month월 start_day일 부터 ~ end_year년 end_month월 end_day일까지 기사를 수집합니다.
target_urls = self.make_news_page_url(url_format, self.date)
print(f'{category_name} Urls are generated')

print(f'{category_name} is collecting ...')
for url in target_urls:
request = self.get_url_data(url)
document = BeautifulSoup(request.content, 'html.parser')
Expand Down Expand Up @@ -186,5 +232,5 @@ def start(self):
if __name__ == "__main__":
    # Manual smoke test: crawl the living/culture category for Jan-Feb 2018.
    # Fix: the diff residue left two conflicting set_date_range calls here --
    # the removed positional-int call (2018, 1, 2018, 2), which no longer
    # matches the string-based API, and the new one. Keep only the new call.
    crawler = ArticleCrawler()
    crawler.set_category('생활문화')
    crawler.set_date_range('2018-01', '2018-02')
    crawler.start()
16 changes: 16 additions & 0 deletions korea_news_crawler/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def __init__(self, month):
def __str__(self):
return self.message

# Raised when a day-of-month value falls outside the valid range
# (e.g. day 30 in February, or day < 1).
class InvalidDay(Exception):
    def __init__(self, day):
        # Keep the formatted text on the instance so callers can read it
        # directly, mirroring the other exception classes in this module.
        self.message = f'{day} is an invalid day'

    def __str__(self):
        return self.message



# 시작 달과 끝나는 달이 올바르지 않을 때
class OverbalanceMonth(Exception):
Expand All @@ -62,6 +71,13 @@ def __init__(self, start_month, end_month):
def __str__(self):
return self.message

# Raised when the start day comes after the end day within the same
# year and month of the requested crawl range.
class OverbalanceDay(Exception):
    def __init__(self, start_day, end_day):
        # Message format mirrors OverbalanceMonth for consistency.
        self.message = f'{start_day}(start day) is an overbalance with {end_day}(end day)'

    def __str__(self):
        return self.message


# 실행시간이 너무 길어서 데이터를 얻을 수 없을 때
class ResponseTimeout(Exception):
Expand Down
7 changes: 3 additions & 4 deletions korea_news_crawler/sample.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from korea_news_crawler.articlecrawler import ArticleCrawler

if __name__ == "__main__":
    # Fix: the diff residue interleaved the removed and added versions of
    # this script, leaving duplicate conflicting set_category and
    # set_date_range calls (the old one used the removed positional-int
    # API). Keep only the post-change version.
    crawler = ArticleCrawler()
    # Available categories: 정치, 경제, 생활문화, IT과학, 사회, 세계
    # (politics, economy, living culture, IT/science, society, world)
    crawler.set_category("IT과학", "세계")
    # Crawl from 2017-12(-01) through 2018-01-13; dates use 'YYYY[-MM[-DD]]'.
    crawler.set_date_range('2017-12', '2018-01-13')
    crawler.start()
5 changes: 3 additions & 2 deletions korea_news_crawler/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ class Writer(object):
def __init__(self, category, article_category, date):
    """Prepare a CSV writer whose file name encodes the crawl date range.

    Args:
        category: top-level category label used in the output file name.
        article_category: article category label for the file name.
        date: dict with start/end year, month and day, as produced by
            ArticleCrawler.set_date_range.
    """
    # Years are kept as-is; months and days are left-padded to two digits
    # so generated file names stay fixed-width and sort naturally.
    self.start_year = date['start_year']
    self.start_month = str(date['start_month']).zfill(2)
    self.start_day = str(date['start_day']).zfill(2)
    self.end_year = date['end_year']
    self.end_month = str(date['end_month']).zfill(2)
    self.end_day = str(date['end_day']).zfill(2)
    self.file = None
    self.initialize_file(category, article_category)

Expand All @@ -20,7 +21,7 @@ def initialize_file(self, category, article_category):
if os.path.exists(output_path) is not True:
os.mkdir(output_path)

file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}_{self.end_year}{self.end_month}.csv'
file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}{self.start_day}_{self.end_year}{self.end_month}{self.end_day}.csv'
if os.path.isfile(file_name):
raise ExistFile(file_name)

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[metadata]
description-file = README.md
description_file = README.md
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from setuptools import setup

# build package command: python setup.py bdist_wheel
# release package command: twine upload dist/KoreaNewsCrawler-version-py3-none-any.whl
# release package command: twine upload dist/KoreaNewsCrawler-${version}-py3-none-any.whl

setup(
name = 'KoreaNewsCrawler',
version = '1.50',
version = '1.51',
description = 'Crawl the korean news',
author = 'lumyjuwon',
author_email = '[email protected]',
url = 'https://github.com/lumyjuwon/KoreaNewsCrawler',
download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.50.tar.gz',
download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.51.tar.gz',
install_requires = ['requests', 'beautifulsoup4'],
packages = ['korea_news_crawler'],
keywords = ['crawl', 'KoreaNews', 'crawler'],
Expand Down

0 comments on commit 6a9ba0d

Please sign in to comment.