Japan2026/main.py at main · ImCodingCat/Japan2026 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import requests
import json
import base64
import os
import re
import time
from pathlib import Path
from dotenv import load_dotenv
import instaloader
import concurrent.futures

from datetime import datetime

# Read variables from a local .env file into the process environment.
load_dotenv()

# OPEN_ROUTER_API_KEY is expected to be defined in .env; it is read with os.getenv further down.

def file_to_base64(video_path):
    """Return the contents of the file at *video_path* as base64 text.

    The file is opened in binary mode and read in a single call, so the
    whole payload is held in memory before being encoded.
    """
    with open(video_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def file_extension_to_mime(path: Path) -> str:
    """Return the image MIME subtype implied by *path*'s file extension.

    The result is the part that follows ``image/`` in a MIME type, e.g.
    ``"png"`` or ``"jpeg"``. The extension is lower-cased first so that
    ``photo.JPG`` and ``photo.jpg`` both map to ``"jpeg"``.

    Args:
        path: Path whose suffix identifies the image format.

    Returns:
        The lower-cased extension without its leading dot, with ``jpg``
        translated to ``jpeg``. An empty string when *path* has no suffix.
    """
    subtype = path.suffix.lower()[1:]
    # "jpg" is only a file extension; the registered MIME subtype is "jpeg".
    if subtype == "jpg":
        return "jpeg"
    return subtype


# Module-level read of the OpenRouter key. summarize_video and summarize_image
# call os.getenv themselves into a local of the same name, so they do not use this value.
api_key = os.getenv("OPEN_ROUTER_API_KEY")


def extract_shortcode_from_url(instagram_url):
    """
    Pull the shortcode out of an Instagram post or reel URL.

    Recognised URL shapes (the ``www.`` prefix is optional):
    - https://www.instagram.com/p/SHORTCODE/
    - https://www.instagram.com/reel/SHORTCODE/

    Returns:
        The run of letters, digits, ``_`` and ``-`` that follows ``/p/`` or ``/reel/``.

    Raises:
        ValueError: If the URL contains no such segment.
    """
    found = re.search(r"instagram\.com/(?:p|reel)/([A-Za-z0-9_-]+)", instagram_url)
    if found is None:
        raise ValueError(f"Could not extract shortcode from URL: {instagram_url}")
    return found.group(1)


def _download_instagram_post(shortcode, download_dir):
    """
    Download one Instagram post into ``<download_dir>/<shortcode>`` and list its media files.

    Args:
        shortcode: Post shortcode, as found in the post or reel URL.
        download_dir: Parent directory; a sub-directory named after the
            shortcode is created inside it if needed.

    Returns:
        ``(True, video_files)`` when at least one ``.mp4`` file is present in
        the post directory, otherwise ``(False, image_files)`` with the
        ``.png``/``.jpg``/``.jpeg`` files found there (possibly an empty list).
        Both lists are sorted by path so that the result, and in particular
        its first element, does not depend on filesystem listing order.
    """
    download_dir = os.path.join(download_dir, shortcode)
    os.makedirs(download_dir, exist_ok=True)

    # Media only: no thumbnails, geotags, comments or metadata files.
    loader = instaloader.Instaloader(
        download_videos=True,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=False,
        compress_json=False,
        dirname_pattern=download_dir,
        quiet=True
    )

    # Resolve the shortcode to a post and fetch its media into the post directory.
    post = instaloader.Post.from_shortcode(loader.context, shortcode)
    loader.download_post(post, target=download_dir)

    media_dir = Path(download_dir)

    # Videos take precedence: if any are present, images are not reported.
    video_files = sorted(media_dir.glob("*.mp4"))
    if video_files:
        return (True, video_files)

    image_files = sorted(
        image
        for pattern in ("*.png", "*.jpg", "*.jpeg")
        for image in media_dir.glob(pattern)
    )
    return (False, image_files)

def download_instagram_post(shortcode, download_dir="./downloads"):
    """
    Download a post via ``_download_instagram_post``, retrying on failure.

    Up to five attempts are made, with a three-second pause between them.

    Args:
        shortcode: Post shortcode to download.
        download_dir: Parent directory for downloaded media.

    Returns:
        Whatever ``_download_instagram_post`` returns on the first successful attempt.

    Raises:
        Exception: The error from the fifth attempt, if all attempts fail.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            return _download_instagram_post(shortcode, download_dir)
        except Exception as e:
            print(f"Error downloading post: {e}")
            if attempt == max_attempts:
                # Out of retries: propagate with the original traceback and
                # skip the pause, which would only delay the failure.
                raise
            time.sleep(3)

def summarize_video(video_path):
    """
    Summarize video content using OpenRouter API with Gemini model.

    The video file is read in full, base64-encoded and sent inline as an
    ``video/mp4`` data URL in a single chat-completion request.

    Args:
        video_path: Path to the video file

    Returns:
        The model's reply text when the response contains at least one choice;
        the decoded JSON body when it contains none (e.g. an error payload);
        ``None`` when the response is not valid JSON or has an unexpected
        shape, in which case the raw response text is printed.
    """
    # Encode video to base64
    base64_video = file_to_base64(video_path)
    data_url = f"data:video/mp4;base64,{base64_video}"

    # Prepare API request
    api_key = os.getenv("OPEN_ROUTER_API_KEY")
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": """
Analyze this video and extract its core message or content. Focus on:

1. **Text content**: If there's any text, captions, or on-screen writing, transcribe and summarize it
2. **Audio/Speech**: If there's narration, dialogue, or voiceover, summarize what's being said
3. **Main message**: What is the video trying to communicate or teach?

DO NOT describe:
- Background visuals or scenery
- What people are doing physically
- Camera movements or video effects
- Background music (unless it's relevant to the message)

Output format:
- If there's text: Provide the text content
- If there's audio/speech: Summarize the spoken message
- Overall message: What is this video about/trying to convey?

Focus only on the informational or communicative content, not the visual presentation.
""",
                },
                {"type": "video_url", "video_url": {"url": data_url}},
            ],
        }
    ]

    payload = {"model": "google/gemini-2.5-flash-preview-09-2025", "messages": messages}

    response = requests.post(url, headers=headers, json=payload)
    response_text = response.text
    try:
        result = json.loads(response_text)

        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        return result
    except (ValueError, KeyError, IndexError, TypeError):
        # Invalid JSON (JSONDecodeError is a ValueError) or a body that does
        # not have the expected structure: show the raw text for diagnosis.
        # Interrupts such as KeyboardInterrupt are deliberately not caught.
        print(response_text)
        return None


def summarize_image(images: list[Path]):
    """
    Summarize the informational content of one or more images using the OpenRouter API.

    Every image is base64-encoded and attached as a data URL to a single
    chat-completion request, so the model sees all images together.

    Args:
        images: Paths of the image files to analyze; must not be empty.

    Returns:
        The model's reply text when the response contains at least one choice,
        otherwise the decoded JSON response body.

    Raises:
        ValueError: If ``images`` is empty. Without attachments the prompt
            would refer to images the model never receives.
    """
    if not images:
        raise ValueError("summarize_image requires at least one image path")

    # Encode each image as a data URL
    base64_images = [
        f"data:image/{file_extension_to_mime(path)};base64,{file_to_base64(path)}"
        for path in images
    ]

    # Prepare API request
    api_key = os.getenv("OPEN_ROUTER_API_KEY")
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    content = [
        {
            "type": "text",
            "text": """
Analyze this images and extract its core content or message. Focus on:

1. **Text content**: If there's any text, captions, labels, or writing, transcribe it completely
2. **Infographic data**: If it contains charts, graphs, statistics, or data visualizations, extract the key information
3. **Main message**: What information is this image trying to communicate?

DO NOT describe:
- Background aesthetics or design elements
- Colors, fonts, or styling (unless crucial to understanding)
- General scene descriptions
- Decorative elements

Output format:
- Only output the entire summarization

Focus only on extracting the informational content, not describing how it looks.
""",
        },
    ]

    # The instruction text comes first, followed by one entry per image.
    for image in base64_images:
        content.append({"type": "image_url", "image_url": {"url": image}})

    messages = [{"role": "user", "content": content}]

    payload = {"model": "qwen/qwen3-vl-30b-a3b-instruct", "messages": messages}

    response = requests.post(url, headers=headers, json=payload)
    result = response.json()
    if "choices" in result and len(result["choices"]) > 0:
        return result["choices"][0]["message"]["content"]
    return result


def summarize_instagram_post(instagram_url, share_text, download_dir="./downloads"):
    """
    Download an Instagram post or reel and save a summary of it as Markdown.

    The summary is written to ``summarization/<shortcode>.md``. If that file
    already exists the post is neither downloaded nor summarized again.

    Args:
        instagram_url: Instagram post or reel URL
        share_text: Text used as the Markdown heading of the summary file;
            literal ``\\uXXXX`` escape sequences are stripped from it.
            ``None`` is treated as an empty string.
        download_dir: Directory to save downloaded content

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    print(f"Downloading from: {instagram_url}")

    os.makedirs("summarization", exist_ok=True)

    shortcode = extract_shortcode_from_url(instagram_url)
    output_path = f"summarization/{shortcode}.md"

    if os.path.exists(output_path):
        print(f"Summarization already exists for {shortcode}")
        return

    # Download the post/reel
    (is_video, files) = download_instagram_post(shortcode, download_dir)

    # Computed outside the f-string: reusing the same quote character inside
    # an f-string expression is a syntax error before Python 3.12.
    media_kind = "video" if is_video else "image"
    print(f"Analyzing {media_kind} content...")

    summary = summarize_video(files[0]) if is_video else summarize_image(files)

    # The summarizers return None or the raw response body when the request
    # did not produce a reply. Writing that to disk would mark the post as
    # done and prevent any later retry.
    if not isinstance(summary, str):
        print(f"Summarization failed for {shortcode}: {summary}")
        return

    # Clean the heading before opening the file so a failure here cannot
    # leave an empty file behind.
    share_text = re.sub(r'\\u[0-9a-fA-F]{4}', '', share_text or "")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(f"# {share_text}\n\n{summary}")

    print(f"Summarization saved to {output_path}")


# Input: message_1.json, taken from an Instagram data export of a group chat into which interesting reels were shared.

if __name__ == "__main__":
    links_list = []

    # JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
    with open("message_1.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    # Keep only messages that carry a shared link. "share_text" is treated as
    # optional, so a share without it still gets summarized.
    for message in data["messages"]:
        share = message.get("share")
        if not share or "link" not in share:
            continue
        links_list.append({
            "link": share["link"],
            "share_text": share.get("share_text", ""),
        })

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Map each future back to its link so failures can be reported.
        futures = {
            executor.submit(summarize_instagram_post, link["link"], link["share_text"]): link["link"]
            for link in links_list
        }

        # An exception raised in a worker stays inside its future unless it is
        # inspected, so report each failure instead of dropping it silently.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f"Failed to summarize {futures[future]}: {error}")