Merged
57 commits
6439d65
Merge pull request #3 from HIGHFIVE-SW/apirequest
seominjae1 Apr 10, 2025
d57a152
Merge pull request #6 from HIGHFIVE-SW/apirequest
seominjae1 May 13, 2025
cda214c
delete api_request
seominjae1 May 13, 2025
5c2d940
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
seominjae1 May 13, 2025
374d130
delete api_request
seominjae1 May 13, 2025
c13d73d
add docstring
seominjae1 May 13, 2025
84370fa
Merge pull request #9 from HIGHFIVE-SW/apirequest
seominjae1 May 13, 2025
98a25f7
add databaseconnection
seominjae1 May 14, 2025
80bdc11
Merge pull request #10 from HIGHFIVE-SW/apirequest
seominjae1 May 14, 2025
2582e7f
add swagger
seominjae1 May 15, 2025
f15330f
add paddlepaddle
seominjae1 May 16, 2025
da5b84b
Merge pull request #11 from HIGHFIVE-SW/apirequest
seominjae1 May 16, 2025
e0fb127
include imagestream
seominjae1 May 24, 2025
8ba4c32
Merge pull request #12 from HIGHFIVE-SW/apirequest
seominjae1 May 24, 2025
1183181
Adjust OCR API
seominjae1 May 26, 2025
9193352
Merge pull request #13 from HIGHFIVE-SW/apirequest
seominjae1 May 26, 2025
075df9d
Change image download method
seominjae1 May 29, 2025
c65ef56
Add crawlers and update db.py, ext.py, sum_translate.py
urusekai May 30, 2025
cc4639d
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
urusekai May 30, 2025
406345d
Merge pull request #14 from HIGHFIVE-SW/apirequest
seominjae1 May 30, 2025
cd8dfe3
Switch from PaddleOCR to Gemini
seominjae1 May 31, 2025
a2ae77d
Merge pull request #15 from HIGHFIVE-SW/apirequest
seominjae1 May 31, 2025
abca923
Fix API error
seominjae1 Jun 1, 2025
4198677
Merge pull request #16 from HIGHFIVE-SW/apirequest
seominjae1 Jun 1, 2025
2178148
Fix OCR error
seominjae1 Jun 1, 2025
ce77472
Merge pull request #17 from HIGHFIVE-SW/apirequest
seominjae1 Jun 1, 2025
32803bb
Set up CI/CD with GitHub Actions
seominjae1 Jun 2, 2025
3d59b67
Merge pull request #18 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5704ad8
CI/CD test
seominjae1 Jun 2, 2025
14de531
CI/CD test
seominjae1 Jun 2, 2025
b61e31a
Merge pull request #19 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
7c775e0
CI/CD test 2
seominjae1 Jun 2, 2025
3c5dffd
Merge pull request #20 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
3584e1b
Fix CI/CD error
seominjae1 Jun 2, 2025
00ab876
Merge pull request #21 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5752521
CI/CD test 3
seominjae1 Jun 2, 2025
7205272
Merge pull request #22 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
1135d6c
Fix CI/CD error 2
seominjae1 Jun 2, 2025
af4de82
Merge pull request #23 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
7cbc655
CI/CD test 4
seominjae1 Jun 2, 2025
b3d2f00
Merge pull request #24 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5314425
CI/CD fix 3
seominjae1 Jun 2, 2025
cb8b4bc
Merge pull request #25 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
3053e4b
Update deploy.yml
seominjae1 Jun 2, 2025
d13319f
Fix CI/CD error 4
seominjae1 Jun 2, 2025
0f97fbe
Merge branch 'main' into apirequest
seominjae1 Jun 2, 2025
106dafa
Merge pull request #26 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
846ac01
Fix CI/CD error 5
seominjae1 Jun 2, 2025
012b79e
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
seominjae1 Jun 2, 2025
ec21b26
Merge pull request #27 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
26bd37b
CI/CD fix 6
seominjae1 Jun 2, 2025
e6839d3
Merge pull request #28 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
8d1f93b
CI/CD fix 7
seominjae1 Jun 2, 2025
56ee75f
Merge pull request #29 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
c28196c
CI/CD fix 8
seominjae1 Jun 2, 2025
7a489fb
Merge pull request #30 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
0ab7e08
Merge branch 'develop' into main
seominjae1 Jun 5, 2025
24 changes: 24 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,24 @@
name: Deploy with GitHub Actions

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set execute permissions for deploy script
        run: chmod +x ${{ github.workspace }}/deploy_script.sh

      - name: Setup SSH Key
        run: |
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem
          chmod 600 id_rsa.pem

          ssh -i id_rsa.pem -o StrictHostKeyChecking=no [email protected] -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh"
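For reference, the SSH step above reduces to a single remote command chain run on the deployment host. Below is a minimal local sketch of that same step in Python; it is only illustrative, the user@host is redacted in the PR view, and deploy_script.sh itself is not part of this diff.

import subprocess

# Hypothetical local equivalent of the workflow's "Setup SSH Key" step.
# Assumes id_rsa.pem already exists locally; host and port mirror the workflow.
REMOTE = "[email protected]"  # redacted in the PR view; substitute the real user@host
REMOTE_CMD = (
    "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && "
    "git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh"
)

subprocess.run(
    ["ssh", "-i", "id_rsa.pem", "-o", "StrictHostKeyChecking=no",
     "-p", "50735", REMOTE, REMOTE_CMD],
    check=True,  # raise if the remote deploy fails
)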
56 changes: 0 additions & 56 deletions api_request/reliefweb.py

This file was deleted.

5 changes: 4 additions & 1 deletion app.py
@@ -3,9 +3,10 @@
from flask import Flask
from flask_cors import CORS
from dotenv import load_dotenv
from flasgger import Swagger

from server.logger import logger

#test
# Add the directory containing app.py to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
@@ -23,6 +24,8 @@
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})

swagger = Swagger(app)

# Register all Blueprints
from chat import chat_bp
app.register_blueprint(chat_bp)
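The app.py change wires Flasgger's Swagger UI into the existing Flask app alongside the Blueprint registration. A minimal self-contained sketch of that pattern follows; the chat Blueprint name comes from the diff, while the route and docstring are illustrative only.

from flask import Flask, Blueprint
from flask_cors import CORS
from flasgger import Swagger

app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})
swagger = Swagger(app)  # serves interactive docs at /apidocs by default

# Illustrative Blueprint; the real project imports chat_bp from chat.py
chat_bp = Blueprint("chat", __name__)

@chat_bp.route("/chat/ping")
def ping():
    """A trivial endpoint.
    ---
    responses:
      200:
        description: pong
    """
    return {"message": "pong"}

app.register_blueprint(chat_bp)

if __name__ == "__main__":
    app.run(debug=True)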
128 changes: 128 additions & 0 deletions crawler/bbc_crawler.py
@@ -0,0 +1,128 @@
import requests
from crawler.keyword_extractor import extract_keyword
from summarization.sum_translate import translate_en_to_ko
from crawler.save_to_db import save_issues
from bs4 import BeautifulSoup
from datetime import datetime
from server.db import run_query

BASE_URL = 'https://web-cdn.api.bbci.co.uk/xd/content-collection/'
COLLECTIONS = {
    'natural-wonders': '9f0b9075-b620-4859-abdc-ed042dd9ee66',
    'weather-science': '696fca43-ec53-418d-a42c-067cb0449ba9',
    'climate-solutions': '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b',
}
HEADERS = {
    'User-Agent': 'Mozilla/5.0'
}
SIZE = 9

def get_last_issue_date():
    sql = """
        SELECT MAX(issue_date)
        FROM issues;
    """
    result = run_query(sql)

    if result and result[0][0]:
        dt = result[0][0]
        latest_issue_date = dt.strftime("%Y-%m-%d %H:%M:%S.%f")
        return latest_issue_date
    else:
        return None

def is_end(date, end_time):
    date_dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f")
    end_time_dt = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f")
    return date_dt <= end_time_dt

def get_datetime(time):
    dt = datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ")
    return dt.strftime("%Y-%m-%d %H:%M:%S.%f")

def get_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    content_divs = soup.find_all('div', attrs={'data-component': 'text-block'})
    contents = [div.get_text(strip=True) for div in content_divs]
    full_content = '\n'.join(contents) if contents else "No Content"

    return full_content

def get_articles(page, collection_id, end_time):
    params = {
        'page': page,
        'size': SIZE,
    }

    response = requests.get(BASE_URL + collection_id, params=params, headers=HEADERS)

    if not response:
        return []

    datas = response.json().get('data')
    articles = []

    for data in datas:
        date = get_datetime(data['firstPublishedAt'])

        if end_time:
            if is_end(date, end_time):
                break

        title = translate_en_to_ko(data['title'])
        keyword = extract_keyword(data['summary'])
        summary = translate_en_to_ko(data['summary'])
        url = "https://www.bbc.com" + data['path']
        image = data['indexImage']['model']['blocks']['src'] or None

        articles.append(
            {
                'content': summary,
                'image_url': image,
                'issue_date': date,
                'keyword': keyword,
                'site_url': url,
                'title': title,
            }
        )
        print(f"[BBC] 크롤링 완료 : {title}")

    return articles

def crawl():
    print("[BBC] 크롤링 시작")
    results = []
    last_issue_date = get_last_issue_date()

    if last_issue_date:
        print(f"[BBC] DB의 마지막 이슈 이후 데이터만 크롤링 시작 (DATE : {last_issue_date})")
    else:
        print("[BBC] DB에 이슈 없음, 모든 데이터 크롤링 시작")

    for category, collection_id in COLLECTIONS.items():
        # print(f"[BBC] 카테고리 {category} :")
        page = 0

        while True:
            articles = get_articles(page, collection_id, last_issue_date)

            if not articles:
                break

            results.extend(articles)
            page += 1

    if results:
        print(f"[BBC] 크롤링 완료 : {len(results)}개의 이슈를 크롤링했습니다.")
        save_issues(results)
    else:
        print("[BBC] 크롤링 완료 : 새로운 이슈가 없습니다.")

def main():
    crawl()

if __name__ == '__main__':
    main()
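The crawler stops paging once an article's firstPublishedAt falls at or before the newest issue_date already stored in the database. A small standalone sketch of that cutoff check with sample timestamps (values are illustrative; the real dates come from the BBC API and the issues table):

from datetime import datetime

FMT = "%Y-%m-%d %H:%M:%S.%f"

def is_end(date, end_time):
    # Same comparison as bbc_crawler.is_end: stop once already-stored data is reached.
    return datetime.strptime(date, FMT) <= datetime.strptime(end_time, FMT)

# Illustrative values only.
last_in_db = "2025-05-20 09:00:00.000000"   # stands in for MAX(issue_date) from the DB
new_article = "2025-05-25 12:30:00.000000"  # newer than the cutoff -> keep crawling
old_article = "2025-05-19 08:00:00.000000"  # at/older than the cutoff -> stop

print(is_end(new_article, last_in_db))  # False: article is new, keep it
print(is_end(old_article, last_in_db))  # True: reached already-crawled data, break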
146 changes: 146 additions & 0 deletions crawler/idealist_crawler.py
@@ -0,0 +1,146 @@
import requests
import json
from datetime import datetime, timedelta, timezone
from crawler.keyword_extractor import extract_keyword
from crawler.save_to_db import save_activities
from server.db import run_query

ENDPOINT = "https://nsv3auess7-dsn.algolia.net/1/indexes/*/queries"
HEADERS = {
    "Content-Type": "application/json",
    "x-algolia-agent": "Algolia for JavaScript (5.20.0); Search (5.20.0); Browser",
    "x-algolia-api-key": "c2730ea10ab82787f2f3cc961e8c1e06",
    "x-algolia-application-id": "NSV3AUESS7"
}
DEFAULT_IMAGE_URL = "https://www.idealist.org/assets/417d88fd628db1c1ac861f3ea8db58c1a159d52a/images/icons/action-opps/action-opps-volunteermatch.svg"

def get_last_timestamp():
    sql = """
        SELECT start_date
        FROM activities
        WHERE activity_site = 'IDEALIST'
        ORDER BY start_date DESC
        LIMIT 1;
    """
    last_timestamp = run_query(sql)

    if last_timestamp:
        dt = last_timestamp[0][0].replace(tzinfo=timezone.utc)
        return int(dt.timestamp())
    else:
        return 0

def build_payload(page, type='volunteer', timestamp=0):
    if type == 'volunteer':
        filters = f"actionType:'VOLOP' AND published > {timestamp}"
        index_name = "idealist7-production-action-opps"
    else:
        filters = f"type:'INTERNSHIP' AND published > {timestamp}"
        index_name = "idealist7-production"

    return {
        "requests": [
            {
                "indexName": index_name,
                "facets": ["*"],
                "hitsPerPage": 100,
                "attributesToSnippet": ["description:20"],
                "attributesToRetrieve": ["*"],
                "filters": filters,
                "removeStopWords": True,
                "ignorePlurals": True,
                "advancedSyntax": True,
                "queryLanguages": ["en"],
                "page": page,
                "query": "",
                "getRankingInfo": True,
                "clickAnalytics": True,
                "analytics": True
            }
        ]
    }

def get_url(item):
    url = item.get("url")
    if isinstance(url, str):
        return url
    elif isinstance(url, dict):
        return "https://www.idealist.org" + next(iter(url.values()), "")
    return ""

def get_image(item):
    img = item.get("imageUrl") or DEFAULT_IMAGE_URL
    return img

def get_published(item):
    timestamp = item.get("published")
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')

def get_activities(page, timestamp, type):
    payload = build_payload(page, type, timestamp)
    response = requests.post(ENDPOINT, headers=HEADERS, json=payload)

    try:
        data = response.json()["results"][0]["hits"]
    except Exception as e:
        print(f"[!] JSON 파싱 에러: {e}")
        return None

    result = []

    if data:
        for item in data:
            activity_type = "VOLUNTEER" if type == 'volunteer' else 'INTERNSHIP'
            activity_content = item.get("description")
            activity_name = item.get("name")
            activity_image_url = get_image(item)
            activity_url = get_url(item)
            start_date = get_published(item)
            end_date = None
            keyword = extract_keyword(activity_content)

            result.append(
                {
                    "activity_site": "IDEALIST",
                    "activity_type": activity_type,
                    "activity_content": activity_content,
                    "end_date": end_date,
                    "activity_image_url": activity_image_url,
                    "keyword": keyword,
                    "activity_name": activity_name,
                    "site_url": activity_url,
                    "start_date": start_date
                }
            )
            print(f"[IDEALIST] 크롤링 완료 : {item.get('name', '')}")
        return result
    else:
        return None

def crawl():
    print("[IDEALIST] 크롤링 시작")
    crawled_activities = []
    last_timestamp = get_last_timestamp()

    if last_timestamp > 0:
        print(f"[IDEALIST] DB의 마지막 활동 이후 데이터만 크롤링 시작 (TIMESTAMP: {last_timestamp})")
    else:
        print("[IDEALIST] DB에 활동 없음, 모든 데이터 크롤링 시작")

    for type in ['volunteer', 'internship']:
        page = 0
        while True:
            activities = get_activities(page, last_timestamp, type)
            if not activities:
                break
            crawled_activities.extend(activities)
            page += 1

    if crawled_activities:
        print(f"[IDEALIST] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.")
        save_activities(crawled_activities)
    else:
        print("[IDEALIST] 크롤링 완료 : 새로운 활동이 없습니다.")

if __name__ == "__main__":
    crawl()
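The Idealist crawler filters on Algolia's published field, a Unix timestamp, so the stored start_date has to be converted back to epoch seconds for the next incremental run. A small sketch of that round trip (the datetime value stands in for the start_date row returned by run_query):

from datetime import datetime, timezone

# Stand-in for the most recent start_date stored in the activities table.
last_start_date = datetime(2025, 5, 28, 10, 15, 0)

# Same conversion as get_last_timestamp(): treat the stored value as UTC
# and turn it into epoch seconds for the Algolia numeric filter.
last_ts = int(last_start_date.replace(tzinfo=timezone.utc).timestamp())

# Same filter string build_payload() sends for the volunteer index.
filters = f"actionType:'VOLOP' AND published > {last_ts}"
print(filters)  # actionType:'VOLOP' AND published > 1748427300

# And the reverse direction, as in get_published(): epoch seconds -> stored string.
restored = datetime.fromtimestamp(last_ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')
print(restored)  # 2025-05-28 10:15:00.000000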