scrapeV2.py
import requests


def get_all_pages_from_wiki(language_code, output_file):
    base_url = f"https://{language_code}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "aplimit": "max",  # Retrieve the maximum number of pages per request
    }
    all_pages = set()
    continue_param = None
    total_pages = 0
    target_pages = 100000  # Change to the desired target

    while total_pages < target_pages:
        if continue_param:
            params["apcontinue"] = continue_param
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Fail fast on HTTP errors
        data = response.json()

        for page in data["query"]["allpages"]:
            # Check that 'title' exists in the dictionary before adding
            if "title" in page:
                # Construct the full URL and add it to the set
                full_url = f"https://{language_code}.wikipedia.org/wiki/{page['title'].replace(' ', '_')}"
                all_pages.add(full_url)
                total_pages += 1
                if total_pages >= target_pages:
                    break

        print(f"Pages retrieved: {total_pages}/{target_pages}", end="\r")  # Update progress

        # Follow the API's continuation token until the target is reached
        # or the wiki runs out of pages
        if "continue" in data and total_pages < target_pages:
            continue_param = data["continue"]["apcontinue"]
        else:
            break

    # Write the collected URLs to the output file
    with open(output_file, "w", encoding="utf-8") as file:
        for url in all_pages:
            file.write(url + "\n")
    print(f"\nStopped at {total_pages} pages. URLs saved to {output_file}.")


# Replace 'sv' with the appropriate language code
get_all_pages_from_wiki("sv", "sv_wiki_links.txt")
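
For reference, a minimal sketch of the continuation protocol the script relies on: each allpages response carries at most aplimit titles, and when more pages remain, the response includes a continue.apcontinue token marking where the next request should resume. The snippet below probes a single page of results against the same endpoint; aplimit is lowered to 5 here purely for the demonstration.

import requests

# Fetch one small page of results to inspect the continuation token.
resp = requests.get(
    "https://sv.wikipedia.org/w/api.php",
    params={
        "action": "query",
        "format": "json",
        "list": "allpages",
        "aplimit": "5",  # small value chosen only for this demonstration
    },
)
resp.raise_for_status()
data = resp.json()

# Each entry is a dict with 'pageid', 'ns', and 'title' keys.
for page in data["query"]["allpages"]:
    print(page["title"])

# Token to pass as 'apcontinue' in the next request, or None if exhausted.
print(data.get("continue", {}).get("apcontinue"))

Note that for any sustained crawling, Wikimedia's API etiquette asks clients to send a descriptive User-Agent header, which neither the script above nor this sketch does.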