Skip to content

Commit ff5b359

Browse files
authored
Merge pull request #51 from HIGHFIVE-SW/develop
Develop
2 parents af80bdb + eae9178 commit ff5b359

File tree

4 files changed

+88
-14
lines changed

4 files changed

+88
-14
lines changed

app.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
from ocr import ocr_bp
3434
app.register_blueprint(ocr_bp)
3535

36+
from crawler import crawler_bp
37+
app.register_blueprint(crawler_bp)
38+
3639
swagger=Swagger(app)
3740

3841
# logger.debug('DEBUG logging test.')

crawler/__init__.py

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,88 @@
22
from crawler.main_crawler import run_crawlers
33
from server.logger import logger
44

5-
crawler_bp = Blueprint('crawler', __name__, url_prefix='/crawler')
5+
import threading
66

7-
@crawler_bp.route('/', methods=['GET'])
8-
def crawler():
9-
# GET 파라미터에서 targets 가져오기
10-
targets_param = request.args.get('targets') # 예: 'bbc,wevity'
7+
crawler_bp = Blueprint('crawler', __name__, url_prefix='/crawler')
118

12-
# 파라미터가 있다면 리스트로 분할
13-
if targets_param:
14-
targets = [t.strip() for t in targets_param.split(',')]
15-
else:
16-
targets = None # 전체 실행
9+
# Shared crawl-status record; every read/write must hold `lock`.
# state transitions: idle -> running -> done | error
status = {
    "state": "idle",
    "error": None,
    "targets": None,
}
lock = threading.Lock()
16+
17+
def background_crawl(targets):
    """Run the crawlers in a worker thread, tracking progress in `status`.

    Args:
        targets: list of crawl target names, or None to run every crawler.

    Side effects:
        Mutates the module-level `status` dict (always under `lock`):
        running -> done on success, running -> error (with the message)
        on failure. Never raises; failures are logged instead.
    """
    # NOTE: no `global` needed — `status` is mutated in place, never rebound.
    try:
        with lock:
            status["state"] = "running"
            status["error"] = None
            status["targets"] = targets or "all"

        # Long-running work runs outside the lock so /status stays responsive.
        run_crawlers(targets)

        with lock:
            status["state"] = "done"
    except Exception as e:
        logger.error(f"크롤러 실행 중 오류: {e}")
        with lock:
            status["state"] = "error"
            status["error"] = str(e)
37+
38+
@crawler_bp.route('/run', methods=['GET'])
def start_crawler():
    """Start the crawlers in a background thread.

    Query Parameters:
        targets (str, optional): comma-separated crawl targets,
            e.g. "bbc,wevity". When omitted, all targets are crawled.

    Returns:
        JSON:
            - 202 Accepted:
                {"message": "크롤러 실행 시작됨", "targets": <list> or "all"}
            - 429 Too Many Requests (a crawl is already in progress):
                {"message": "이미 크롤러가 실행 중입니다."}
    """
    targets_param = request.args.get('targets')
    targets = [t.strip() for t in targets_param.split(',')] if targets_param else None

    with lock:
        if status["state"] == "running":
            return jsonify({"message": "이미 크롤러가 실행 중입니다."}), 429
        # Claim the "running" state BEFORE releasing the lock; otherwise two
        # concurrent requests could both pass the check above and start two
        # crawler threads (check-then-act race).
        status["state"] = "running"
        status["error"] = None
        status["targets"] = targets or "all"

    # daemon=True so a hung crawl cannot block interpreter shutdown.
    thread = threading.Thread(target=background_crawl, args=(targets,), daemon=True)
    thread.start()

    return jsonify({"message": "크롤러 실행 시작됨", "targets": targets or "all"}), 202
70+
71+
72+
@crawler_bp.route('/status', methods=['GET'])
def get_crawler_status():
    """Report the crawler's current state.

    Returns:
        JSON (200):
            {
                "state": "idle" | "running" | "done" | "error",
                "error": error message or null,
                "targets": list of targets, "all", or null
            }
    """
    # Copy the fields under the lock, then serialize outside it.
    with lock:
        snapshot = {key: status[key] for key in ("state", "error", "targets")}
    return jsonify(snapshot), 200

crawler/bbc_crawler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
COLLECTIONS = {
1010
'natural-wonders' : '9f0b9075-b620-4859-abdc-ed042dd9ee66',
1111
'weather-science' : '696fca43-ec53-418d-a42c-067cb0449ba9',
12-
'climate-solutions' : '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b',
12+
'climate-solutions' : '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b',
13+
'world' : '07cedf01-f642-4b92-821f-d7b324b8ba73',
14+
'innovation' : '3da03ce0-ee41-4427-a5d9-1294491e0448'
1315
}
1416
HEADERS = {
1517
'User-Agent': 'Mozilla/5.0'
@@ -76,7 +78,6 @@ def get_articles(page, collection_id, end_time):
7678
url = "https://www.bbc.com" + data['path']
7779
content = get_content(url)
7880
title, content, keyword = summarize_and_categorize_issue(data['title'], content)
79-
print(content)
8081
image = data['indexImage']['model']['blocks']['src'] or None
8182

8283
articles.append(

server/logger.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ def filter(self, record):
181181

182182
logger = logging.getLogger(__name__)
183183

184+
# 로깅 제거
185+
logging.getLogger("httpx").setLevel(logging.WARNING)
186+
logging.getLogger("httpcore").setLevel(logging.WARNING)
187+
logging.getLogger("urllib3").setLevel(logging.WARNING)
184188

185189
if __name__ == '__main__':
186190
# 모듈 사용 테스트

0 commit comments

Comments
 (0)