Merged
57 commits
6439d65
Merge pull request #3 from HIGHFIVE-SW/apirequest
seominjae1 Apr 10, 2025
d57a152
Merge pull request #6 from HIGHFIVE-SW/apirequest
seominjae1 May 13, 2025
cda214c
delete api_request
seominjae1 May 13, 2025
5c2d940
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
seominjae1 May 13, 2025
374d130
delete api_request
seominjae1 May 13, 2025
c13d73d
add docstring
seominjae1 May 13, 2025
84370fa
Merge pull request #9 from HIGHFIVE-SW/apirequest
seominjae1 May 13, 2025
98a25f7
add databaseconnection
seominjae1 May 14, 2025
80bdc11
Merge pull request #10 from HIGHFIVE-SW/apirequest
seominjae1 May 14, 2025
2582e7f
add swagger
seominjae1 May 15, 2025
f15330f
add paddlepaddle
seominjae1 May 16, 2025
da5b84b
Merge pull request #11 from HIGHFIVE-SW/apirequest
seominjae1 May 16, 2025
e0fb127
include imagestream
seominjae1 May 24, 2025
8ba4c32
Merge pull request #12 from HIGHFIVE-SW/apirequest
seominjae1 May 24, 2025
1183181
Adjust OCR API
seominjae1 May 26, 2025
9193352
Merge pull request #13 from HIGHFIVE-SW/apirequest
seominjae1 May 26, 2025
075df9d
Change image download method
seominjae1 May 29, 2025
c65ef56
Add crawlers and update db.py, ext.py, sum_translate.py
urusekai May 30, 2025
cc4639d
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
urusekai May 30, 2025
406345d
Merge pull request #14 from HIGHFIVE-SW/apirequest
seominjae1 May 30, 2025
cd8dfe3
Switch from PaddleOCR to Gemini
seominjae1 May 31, 2025
a2ae77d
Merge pull request #15 from HIGHFIVE-SW/apirequest
seominjae1 May 31, 2025
abca923
Fix API error
seominjae1 Jun 1, 2025
4198677
Merge pull request #16 from HIGHFIVE-SW/apirequest
seominjae1 Jun 1, 2025
2178148
Fix OCR error
seominjae1 Jun 1, 2025
ce77472
Merge pull request #17 from HIGHFIVE-SW/apirequest
seominjae1 Jun 1, 2025
32803bb
Set up CI/CD with GitHub Actions
seominjae1 Jun 2, 2025
3d59b67
Merge pull request #18 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5704ad8
CI/CD test
seominjae1 Jun 2, 2025
14de531
CI/CD test
seominjae1 Jun 2, 2025
b61e31a
Merge pull request #19 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
7c775e0
CI/CD test 2
seominjae1 Jun 2, 2025
3c5dffd
Merge pull request #20 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
3584e1b
Fix CI/CD error
seominjae1 Jun 2, 2025
00ab876
Merge pull request #21 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5752521
CI/CD test 3
seominjae1 Jun 2, 2025
7205272
Merge pull request #22 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
1135d6c
Fix CI/CD error 2
seominjae1 Jun 2, 2025
af4de82
Merge pull request #23 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
7cbc655
CI/CD test 4
seominjae1 Jun 2, 2025
b3d2f00
Merge pull request #24 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
5314425
CI/CD fix 3
seominjae1 Jun 2, 2025
cb8b4bc
Merge pull request #25 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
3053e4b
Update deploy.yml
seominjae1 Jun 2, 2025
d13319f
Fix CI/CD error 4
seominjae1 Jun 2, 2025
0f97fbe
Merge branch 'main' into apirequest
seominjae1 Jun 2, 2025
106dafa
Merge pull request #26 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
846ac01
Fix CI/CD error 5
seominjae1 Jun 2, 2025
012b79e
Merge branch 'apirequest' of https://github.com/HIGHFIVE-SW/HIGHFIVE-…
seominjae1 Jun 2, 2025
ec21b26
Merge pull request #27 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
26bd37b
CI/CD fix 6
seominjae1 Jun 2, 2025
e6839d3
Merge pull request #28 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
8d1f93b
CI/CD fix 7
seominjae1 Jun 2, 2025
56ee75f
Merge pull request #29 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
c28196c
CI/CD fix 8
seominjae1 Jun 2, 2025
7a489fb
Merge pull request #30 from HIGHFIVE-SW/apirequest
seominjae1 Jun 2, 2025
0ab7e08
Merge branch 'develop' into main
seominjae1 Jun 5, 2025
24 changes: 24 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,24 @@
name: Deploy with GitHub Actions

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set execute permissions for deploy script
        run: chmod +x ${{ github.workspace }}/deploy_script.sh

      - name: Setup SSH Key
        run: |
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem
          chmod 600 id_rsa.pem

          ssh -i id_rsa.pem -o StrictHostKeyChecking=no [email protected] -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh"
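For reference, the SSH step above reduces to a single remote command chain run on the deployment host. Below is a minimal local sketch of that same step in Python; it is only illustrative, the user@host is redacted in the PR view, and deploy_script.sh itself is not part of this diff.

import subprocess

# Hypothetical local equivalent of the workflow's "Setup SSH Key" step.
# Assumes id_rsa.pem already exists locally; host and port mirror the workflow.
REMOTE = "[email protected]"  # redacted in the PR view; substitute the real user@host
REMOTE_CMD = (
    "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && "
    "git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh"
)

subprocess.run(
    ["ssh", "-i", "id_rsa.pem", "-o", "StrictHostKeyChecking=no",
     "-p", "50735", REMOTE, REMOTE_CMD],
    check=True,  # raise if the remote deploy fails
)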
56 changes: 0 additions & 56 deletions api_request/reliefweb.py

This file was deleted.

5 changes: 4 additions & 1 deletion app.py
@@ -3,9 +3,10 @@
from flask import Flask
from flask_cors import CORS
from dotenv import load_dotenv
from flasgger import Swagger

from server.logger import logger

#test
# Add the directory containing app.py to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
@@ -23,6 +24,8 @@
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})

swagger = Swagger(app)

# Register all Blueprints
from chat import chat_bp
app.register_blueprint(chat_bp)
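The app.py change wires Flasgger's Swagger UI into the existing Flask app alongside the Blueprint registration. A minimal self-contained sketch of that pattern follows; the chat Blueprint name comes from the diff, while the route and docstring are illustrative only.

from flask import Flask, Blueprint
from flask_cors import CORS
from flasgger import Swagger

app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})
swagger = Swagger(app)  # serves interactive docs at /apidocs by default

# Illustrative Blueprint; the real project imports chat_bp from chat.py
chat_bp = Blueprint("chat", __name__)

@chat_bp.route("/chat/ping")
def ping():
    """A trivial endpoint.
    ---
    responses:
      200:
        description: pong
    """
    return {"message": "pong"}

app.register_blueprint(chat_bp)

if __name__ == "__main__":
    app.run(debug=True)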
128 changes: 128 additions & 0 deletions crawler/bbc_crawler.py
@@ -0,0 +1,128 @@
import requests
from crawler.keyword_extractor import extract_keyword
from summarization.sum_translate import translate_en_to_ko
from crawler.save_to_db import save_issues
from bs4 import BeautifulSoup
from datetime import datetime
from server.db import run_query

BASE_URL = 'https://web-cdn.api.bbci.co.uk/xd/content-collection/'
COLLECTIONS = {
    'natural-wonders': '9f0b9075-b620-4859-abdc-ed042dd9ee66',
    'weather-science': '696fca43-ec53-418d-a42c-067cb0449ba9',
    'climate-solutions': '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b',
}
HEADERS = {
    'User-Agent': 'Mozilla/5.0'
}
SIZE = 9

def get_last_issue_date():
    sql = """
        SELECT MAX(issue_date)
        FROM issues;
    """
    result = run_query(sql)

    if result and result[0][0]:
        dt = result[0][0]
        latest_issue_date = dt.strftime("%Y-%m-%d %H:%M:%S.%f")
        return latest_issue_date
    else:
        return None

def is_end(date, end_time):
    date_dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f")
    end_time_dt = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f")
    return date_dt <= end_time_dt

def get_datetime(time):
    dt = datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ")
    return dt.strftime("%Y-%m-%d %H:%M:%S.%f")

def get_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    content_divs = soup.find_all('div', attrs={'data-component': 'text-block'})
    contents = [div.get_text(strip=True) for div in content_divs]
    full_content = '\n'.join(contents) if contents else "No Content"

    return full_content

def get_articles(page, collection_id, end_time):
    params = {
        'page': page,
        'size': SIZE,
    }

    response = requests.get(BASE_URL + collection_id, params=params, headers=HEADERS)

    if not response:
        return []

    datas = response.json().get('data')
    articles = []

    for data in datas:
        date = get_datetime(data['firstPublishedAt'])

        if end_time:
            if is_end(date, end_time):
                break

        title = translate_en_to_ko(data['title'])
        keyword = extract_keyword(data['summary'])
        summary = translate_en_to_ko(data['summary'])
        url = "https://www.bbc.com" + data['path']
        image = data['indexImage']['model']['blocks']['src'] or None

        articles.append(
            {
                'content': summary,
                'image_url': image,
                'issue_date': date,
                'keyword': keyword,
                'site_url': url,
                'title': title,
            }
        )
        print(f"[BBC] 크롤링 완료 : {title}")

    return articles

def crawl():
    print("[BBC] 크롤링 시작")
    results = []
    last_issue_date = get_last_issue_date()

    if last_issue_date:
        print(f"[BBC] DB의 마지막 이슈 이후 데이터만 크롤링 시작 (DATE : {last_issue_date})")
    else:
        print("[BBC] DB에 이슈 없음, 모든 데이터 크롤링 시작")

    for category, collection_id in COLLECTIONS.items():
        # print(f"[BBC] 카테고리 {category} :")
        page = 0

        while True:
            articles = get_articles(page, collection_id, last_issue_date)

            if not articles:
                break

            results.extend(articles)
            page += 1

    if results:
        print(f"[BBC] 크롤링 완료 : {len(results)}개의 이슈를 크롤링했습니다.")
        save_issues(results)
    else:
        print("[BBC] 크롤링 완료 : 새로운 이슈가 없습니다.")

def main():
    crawl()

if __name__ == '__main__':
    main()
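The crawler stops paging once an article's firstPublishedAt falls at or before the newest issue_date already stored in the database. A small standalone sketch of that cutoff check with sample timestamps (values are illustrative; the real dates come from the BBC API and the issues table):

from datetime import datetime

FMT = "%Y-%m-%d %H:%M:%S.%f"

def is_end(date, end_time):
    # Same comparison as bbc_crawler.is_end: stop once already-stored data is reached.
    return datetime.strptime(date, FMT) <= datetime.strptime(end_time, FMT)

# Illustrative values only.
last_in_db = "2025-05-20 09:00:00.000000"   # stands in for MAX(issue_date) from the DB
new_article = "2025-05-25 12:30:00.000000"  # newer than the cutoff -> keep crawling
old_article = "2025-05-19 08:00:00.000000"  # at/older than the cutoff -> stop

print(is_end(new_article, last_in_db))  # False: article is new, keep it
print(is_end(old_article, last_in_db))  # True: reached already-crawled data, break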
146 changes: 146 additions & 0 deletions crawler/idealist_crawler.py
@@ -0,0 +1,146 @@
import requests
import json
from datetime import datetime, timedelta, timezone
from crawler.keyword_extractor import extract_keyword
from crawler.save_to_db import save_activities
from server.db import run_query

ENDPOINT = "https://nsv3auess7-dsn.algolia.net/1/indexes/*/queries"
HEADERS = {
    "Content-Type": "application/json",
    "x-algolia-agent": "Algolia for JavaScript (5.20.0); Search (5.20.0); Browser",
    "x-algolia-api-key": "c2730ea10ab82787f2f3cc961e8c1e06",
    "x-algolia-application-id": "NSV3AUESS7"
}
DEFAULT_IMAGE_URL = "https://www.idealist.org/assets/417d88fd628db1c1ac861f3ea8db58c1a159d52a/images/icons/action-opps/action-opps-volunteermatch.svg"

def get_last_timestamp():
    sql = """
        SELECT start_date
        FROM activities
        WHERE activity_site = 'IDEALIST'
        ORDER BY start_date DESC
        LIMIT 1;
    """
    last_timestamp = run_query(sql)

    if last_timestamp:
        dt = last_timestamp[0][0].replace(tzinfo=timezone.utc)
        return int(dt.timestamp())
    else:
        return 0

def build_payload(page, type='volunteer', timestamp=0):
    if type == 'volunteer':
        filters = f"actionType:'VOLOP' AND published > {timestamp}"
        index_name = "idealist7-production-action-opps"
    else:
        filters = f"type:'INTERNSHIP' AND published > {timestamp}"
        index_name = "idealist7-production"

    return {
        "requests": [
            {
                "indexName": index_name,
                "facets": ["*"],
                "hitsPerPage": 100,
                "attributesToSnippet": ["description:20"],
                "attributesToRetrieve": ["*"],
                "filters": filters,
                "removeStopWords": True,
                "ignorePlurals": True,
                "advancedSyntax": True,
                "queryLanguages": ["en"],
                "page": page,
                "query": "",
                "getRankingInfo": True,
                "clickAnalytics": True,
                "analytics": True
            }
        ]
    }

def get_url(item):
    url = item.get("url")
    if isinstance(url, str):
        return url
    elif isinstance(url, dict):
        return "https://www.idealist.org" + next(iter(url.values()), "")
    return ""

def get_image(item):
    img = item.get("imageUrl") or DEFAULT_IMAGE_URL
    return img

def get_published(item):
    timestamp = item.get("published")
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')

def get_activities(page, timestamp, type):
    payload = build_payload(page, type, timestamp)
    response = requests.post(ENDPOINT, headers=HEADERS, json=payload)

    try:
        data = response.json()["results"][0]["hits"]
    except Exception as e:
        print(f"[!] JSON 파싱 에러: {e}")
        return None

    result = []

    if data:
        for item in data:
            activity_type = "VOLUNTEER" if type == 'volunteer' else 'INTERNSHIP'
            activity_content = item.get("description")
            activity_name = item.get("name")
            activity_image_url = get_image(item)
            activity_url = get_url(item)
            start_date = get_published(item)
            end_date = None
            keyword = extract_keyword(activity_content)

            result.append(
                {
                    "activity_site": "IDEALIST",
                    "activity_type": activity_type,
                    "activity_content": activity_content,
                    "end_date": end_date,
                    "activity_image_url": activity_image_url,
                    "keyword": keyword,
                    "activity_name": activity_name,
                    "site_url": activity_url,
                    "start_date": start_date
                }
            )
            print(f"[IDEALIST] 크롤링 완료 : {item.get('name', '')}")
        return result
    else:
        return None

def crawl():
    print("[IDEALIST] 크롤링 시작")
    crawled_activities = []
    last_timestamp = get_last_timestamp()

    if last_timestamp > 0:
        print(f"[IDEALIST] DB의 마지막 활동 이후 데이터만 크롤링 시작 (TIMESTAMP: {last_timestamp})")
    else:
        print("[IDEALIST] DB에 활동 없음, 모든 데이터 크롤링 시작")

    for type in ['volunteer', 'internship']:
        page = 0
        while True:
            activities = get_activities(page, last_timestamp, type)
            if not activities:
                break
            crawled_activities.extend(activities)
            page += 1

    if crawled_activities:
        print(f"[IDEALIST] 크롤링 완료 : {len(crawled_activities)}개의 활동을 크롤링했습니다.")
        save_activities(crawled_activities)
    else:
        print("[IDEALIST] 크롤링 완료 : 새로운 활동이 없습니다.")

if __name__ == "__main__":
    crawl()
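The Idealist crawler filters on Algolia's published field, a Unix timestamp, so the stored start_date has to be converted back to epoch seconds for the next incremental run. A small sketch of that round trip (the datetime value stands in for the start_date row returned by run_query):

from datetime import datetime, timezone

# Stand-in for the most recent start_date stored in the activities table.
last_start_date = datetime(2025, 5, 28, 10, 15, 0)

# Same conversion as get_last_timestamp(): treat the stored value as UTC
# and turn it into epoch seconds for the Algolia numeric filter.
last_ts = int(last_start_date.replace(tzinfo=timezone.utc).timestamp())

# Same filter string build_payload() sends for the volunteer index.
filters = f"actionType:'VOLOP' AND published > {last_ts}"
print(filters)  # actionType:'VOLOP' AND published > 1748427300

# And the reverse direction, as in get_published(): epoch seconds -> stored string.
restored = datetime.fromtimestamp(last_ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')
print(restored)  # 2025-05-28 10:15:00.000000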