From e021d029a02f50345ce2db3e36713a29daf0e5aa Mon Sep 17 00:00:00 2001
From: Matthew Giblett
Date: Thu, 31 Oct 2024 15:56:19 +1000
Subject: [PATCH] feat: search multiple engines; add argument to return only
 unique URLs.

---
 serps/cli.py       | 11 ++++++--
 serps/constants.py |  9 ++++++-
 serps/main.py      | 64 ++++++++++++++++++++++++++++++----------------
 3 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/serps/cli.py b/serps/cli.py
index 3e79867..d0b74bc 100644
--- a/serps/cli.py
+++ b/serps/cli.py
@@ -18,7 +18,7 @@
     QUERIES_PATH,
     RESULTS_PATH,
 )
-from serps.main import load_list, request_scrape, save_excel, save_yaml
+from serps.main import get_unique, load_list, request_scrape, save_excel, save_yaml
 
 
 def version_msg() -> str:
@@ -67,8 +67,9 @@ def auth(ctx, username: str, password: str) -> None:
 
 @cli.command(help="Scrape lists.")
 @click.argument("lists", nargs=-1, required=True)
+@click.option("-u", "--unique", is_flag=True, help="Return unique.")
 @click.pass_context
-def scrape(ctx, lists: tuple[str]) -> None:
+def scrape(ctx, lists: tuple[str], unique: bool) -> None:
     auth = ctx.obj[API_USERNAME], ctx.obj[API_PASSWORD]
     df = DataFrame(columns=DATAFRAME_COLUMNS)
     for l in lists:
@@ -83,6 +84,12 @@ def scrape(ctx, lists: tuple[str]) -> None:
     conf_path = ctx.obj[RESULTS_PATH]
     file_path = f"{conf_path}{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}.xlsx"
     save_excel(file_path, df)
+    if unique:
+        df_unique = get_unique(df)
+        file_path = (
+            f"{conf_path}{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}_unique.xlsx"
+        )
+        save_excel(file_path, df_unique)
 
 
 @cli.command(help="Add query to specified list.")
diff --git a/serps/constants.py b/serps/constants.py
index 57b88c0..3cc267b 100644
--- a/serps/constants.py
+++ b/serps/constants.py
@@ -47,5 +47,12 @@
 
 # other
 
-DATAFRAME_COLUMNS = ["Query", "Page", "Position", "URL", "Title", "Description"]
+DATAFRAME_COLUMNS = [
+    "Source",
+    "Query",
+    "Page",
+    "Position",
+    "URL",
+    "Title",
+]
 QUERIES_FILETYPE = ".yaml"
diff --git a/serps/main.py b/serps/main.py
index 3a337cf..2545e15 100644
--- a/serps/main.py
+++ b/serps/main.py
@@ -47,31 +47,51 @@ def load_list(file_path: Path | str) -> dict[str, Any] | None:
 
 
 def request_scrape(auth: tuple[str, str], payload: dict[str, Any]) -> DataFrame:
+    global df
     df = DataFrame(columns=DATAFRAME_COLUMNS)
+    df.index = df.index + 1
     queries = payload["queries"]
     payload.pop("queries")
 
-    for query in queries:
-        payload["query"] = query
-        response = requests.request(
-            method="POST",
-            url=REQUEST_URL,
-            auth=auth,
-            json=payload,
-        )
-        pages = response.json().get("results", [])
-        results = []
-        for page, data in enumerate(pages):
-            for result in data["content"]["results"]["organic"]:
-                results.append(
-                    {
-                        "Query": query,
-                        "Page": page + 1,
-                        "Position": result["pos"],
-                        "URL": result["url"],
-                        "Title": result["title"],
-                        "Description": result["desc"],
-                    }
-                )
+
+    def query_loop():
+        global df
+        for query in queries:
+            print(f"Searching {query} from {payload["source"]}...")
+            payload["query"] = query
+            response = requests.request(
+                method="POST",
+                url=REQUEST_URL,
+                auth=auth,
+                json=payload,
+            )
+            pages = response.json().get("results", [])
+            results = []
+            for page, data in enumerate(pages):
+                for result in data["content"]["results"]["organic"]:
+                    results.append(
+                        {
+                            "Source": payload["source"],
+                            "Query": query,
+                            "Page": page + 1,
+                            "Position": result["pos"],
+                            "URL": result["url"],
+                            "Title": result["title"],
+                        }
+                    )
         df = concat([df, DataFrame(results)], ignore_index=True)
+    if type(payload["source"]) is tuple:
+        sources = payload["source"]
+        payload.pop("source")
+        for source in sources:
+            payload["source"] = source
+            query_loop()
+    else:
+        query_loop()
+
     return df
+
+
+def get_unique(df: DataFrame) -> DataFrame:
+    df = df.drop_duplicates(subset="URL")
+    return df
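
The control flow above threads state through a global DataFrame, which can be hard to follow in diff form. Below is a self-contained sketch of the same multi-engine fan-out that runs without credentials; fake_search() is a hypothetical stand-in for the authenticated POST to REQUEST_URL, and the engine names and URLs are illustrative only, not real API values.

from typing import Any


def fake_search(payload: dict[str, Any]) -> list[dict[str, Any]]:
    # Hypothetical stand-in for the authenticated POST in request_scrape();
    # fabricates a single organic result for the current engine and query.
    return [
        {
            "Source": payload["source"],
            "Query": payload["query"],
            "Page": 1,
            "Position": 1,
            "URL": f"https://example.com/{payload['query']}",
            "Title": payload["query"].title(),
        }
    ]


def scrape_all(payload: dict[str, Any]) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    queries = payload.pop("queries")
    sources = payload.pop("source")
    # As in the patch, a tuple of sources fans out across engines;
    # anything else is treated as a single engine.
    if not isinstance(sources, tuple):
        sources = (sources,)
    for source in sources:
        payload["source"] = source
        for query in queries:
            payload["query"] = query
            rows.extend(fake_search(payload))
    return rows


print(scrape_all({"queries": ["python"], "source": ("engine_a", "engine_b")}))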
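The new -u/--unique flag then writes a second spreadsheet with duplicate URLs removed. A minimal sketch of that step follows; the sample rows are illustrative, but drop_duplicates(subset="URL") is the same pandas call get_unique() makes. It keeps the first occurrence of each URL, so result ordering decides which engine's row survives.

from pandas import DataFrame

df = DataFrame(
    [
        {"Source": "engine_a", "Query": "python", "URL": "https://example.com/a"},
        {"Source": "engine_b", "Query": "python", "URL": "https://example.com/a"},
        {"Source": "engine_a", "Query": "pandas", "URL": "https://example.com/b"},
    ]
)

# Drop rows whose URL was already seen, as get_unique() does in serps/main.py.
print(df.drop_duplicates(subset="URL"))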