Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code Refactor for Speed and Readability #32

Merged
merged 1 commit
Jun 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions beautiful_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def download_uri(uri, dir="./"):

def download_baidu(word):
"""Downloads images from Baidu based on a search word, saving them with a specific naming convention."""
url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=" + word + "&ct=201326592&v=flip"
url = f"https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={word}&ct=201326592&v=flip"
pic_url = re.findall('"objURL":"(.*?)",', requests.get(url).text, re.S)

i = 0
Expand All @@ -26,24 +26,23 @@ def download_baidu(word):
print("exception")
continue

string = "pictures" + word + "_" + str(i) + ".jpg"
fp = open(string, "wb")
fp.write(pic.content)
fp.close()
string = f"pictures{word}_{str(i)}.jpg"
with open(string, "wb") as fp:
fp.write(pic.content)
i += 1


def download_google(word):
    """Download image thumbnails for a search word.

    NOTE(review): despite the name, this scrapes *Bing* image search, not
    Google (an earlier Google URL was commented out in the original).

    Every ``<a class="thumb">`` link on the results page is downloaded with
    ``curl`` into the current working directory, using the last path segment
    of the link as the file name.

    Parameters
    ----------
    word : str
        Search term; it is percent-encoded before being placed in the URL.

    Returns
    -------
    None — the downloaded files are the only effect.
    """
    import subprocess
    from urllib.parse import quote_plus

    # Encode the query so spaces/special characters produce a valid URL.
    url = f"https://www.bing.com/images/search?q={quote_plus(word)}"
    soup = BeautifulSoup(requests.get(url).text, "html.parser")

    for anchor in soup.find_all("a", {"class": "thumb"}):
        href = anchor.get("href")
        if not href:  # skip anchors that carry no link
            continue
        filename = href.split("/")[-1]
        # SECURITY: build the argv list directly and avoid the shell, so
        # quotes or metacharacters in the scraped href cannot be executed
        # as shell commands (the original f-string + os.system was
        # injectable via a crafted link).
        subprocess.run(["curl", "-s", "-L", "-o", filename, href], check=False)


Expand All @@ -60,7 +59,7 @@ def get_html():
link = url + link.get("href")
f = dir + link.split("/")[-1]
if not os.path.exists(f):
s = "curl -s -L -o '%s' '%s'" % (f, link)
s = f"curl -s -L -o '{f}' '{link}'"
os.system(s)


Expand All @@ -75,7 +74,7 @@ def organize_folders():
link = url + link.get("href")
f = dir + link.split("/")[-1]
if not os.path.exists(f):
s = "curl -s -L -o '%s' '%s'" % (f, link)
s = f"curl -s -L -o '{f}' '{link}'"
os.system(s)


Expand Down
Loading