
Commit e021d02
feat: search multiple engines; added argument to return only unique URLs.
mjgiblett committed Oct 31, 2024
1 parent 1333fa9 commit e021d02
Showing 3 changed files with 59 additions and 25 deletions.
11 changes: 9 additions & 2 deletions serps/cli.py
@@ -18,7 +18,7 @@
QUERIES_PATH,
RESULTS_PATH,
)
from serps.main import load_list, request_scrape, save_excel, save_yaml
from serps.main import get_unique, load_list, request_scrape, save_excel, save_yaml


def version_msg() -> str:
@@ -67,8 +67,9 @@ def auth(ctx, username: str, password: str) -> None:

@cli.command(help="Scrape lists.")
@click.argument("lists", nargs=-1, required=True)
@click.option("-u", "--unique", is_flag=True, help="Return unique.")
@click.pass_context
def scrape(ctx, lists: tuple[str]) -> None:
def scrape(ctx, lists: tuple[str], unique: bool) -> None:
auth = ctx.obj[API_USERNAME], ctx.obj[API_PASSWORD]
df = DataFrame(columns=DATAFRAME_COLUMNS)
for l in lists:

Check failure on line 75 in serps/cli.py (GitHub Actions / build): Ruff (E741) Ambiguous variable name: `l`
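Note: a possible follow-up (not part of this commit) is simply to give the loop variable a descriptive name, since Ruff flags `l` as easily confused with `1` and `I`. A minimal sketch with hypothetical list names:

lists = ("clients", "competitors")  # hypothetical query-list names
for query_list in lists:            # previously: for l in lists:
    print(f"scraping list {query_list}")
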
@@ -83,6 +84,12 @@ def scrape(ctx, lists: tuple[str]) -> None:
conf_path = ctx.obj[RESULTS_PATH]
file_path = f"{conf_path}{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}.xlsx"
save_excel(file_path, df)
if unique:
df_unique = get_unique(df)
file_path = (
f"{conf_path}{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}_unique.xlsx"
)
save_excel(file_path, df_unique)


@cli.command(help="Add query to specified list.")
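For reference, a minimal sketch of exercising the new flag through click's test runner; it assumes credentials have already been stored via the auth command and that a query list named "clients" exists (both are assumptions, not values from this repository):

from click.testing import CliRunner

from serps.cli import cli

runner = CliRunner()
# "clients" is a hypothetical query list; --unique writes an additional
# workbook containing only the rows whose URL has not been seen before.
result = runner.invoke(cli, ["scrape", "clients", "--unique"])
print(result.exit_code, result.output)
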
9 changes: 8 additions & 1 deletion serps/constants.py
@@ -47,5 +47,12 @@

# other

DATAFRAME_COLUMNS = ["Query", "Page", "Position", "URL", "Title", "Description"]
DATAFRAME_COLUMNS = [
"Source",
"Query",
"Page",
"Position",
"URL",
"Title",
]
QUERIES_FILETYPE = ".yaml"
64 changes: 42 additions & 22 deletions serps/main.py
@@ -47,31 +47,51 @@ def load_list(file_path: Path | str) -> dict[str, Any] | None:


def request_scrape(auth: tuple[str, str], payload: dict[str, Any]) -> DataFrame:
global df
df = DataFrame(columns=DATAFRAME_COLUMNS)
df.index = df.index + 1
queries = payload["queries"]
payload.pop("queries")
for query in queries:
payload["query"] = query
response = requests.request(
method="POST",
url=REQUEST_URL,
auth=auth,
json=payload,
)
pages = response.json().get("results", [])
results = []
for page, data in enumerate(pages):
for result in data["content"]["results"]["organic"]:
results.append(
{
"Query": query,
"Page": page + 1,
"Position": result["pos"],
"URL": result["url"],
"Title": result["title"],
"Description": result["desc"],
}
)

def query_loop():
global df
for query in queries:
print(f"Searching {query} from {payload["source"]}...")
payload["query"] = query
response = requests.request(
method="POST",
url=REQUEST_URL,
auth=auth,
json=payload,
)
pages = response.json().get("results", [])
results = []
for page, data in enumerate(pages):
for result in data["content"]["results"]["organic"]:
results.append(
{
"Source": payload["source"],
"Query": query,
"Page": page + 1,
"Position": result["pos"],
"URL": result["url"],
"Title": result["title"],
}
)
df = concat([df, DataFrame(results)], ignore_index=True)

if type(payload["source"]) is tuple:
sources = payload["source"]
payload.pop("source")
for source in sources:
payload["source"] = source
query_loop()
else:
query_loop()

return df


def get_unique(df: DataFrame) -> DataFrame:
df = df.drop_duplicates(subset="URL")
return df
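
Taken together, a minimal sketch of the new multi-engine path at the library level; the engine identifiers, query, and credentials below are placeholders, and a real call needs valid scraper API credentials:

from serps.main import get_unique, request_scrape

# When payload["source"] is a tuple, request_scrape now repeats the query loop
# once per engine and tags each row with the engine name in the new "Source" column.
payload = {
    "source": ("google_search", "bing_search"),  # assumed engine identifiers
    "queries": ["wireless headphones"],          # hypothetical query
}
df = request_scrape(("api-user", "api-pass"), payload)  # placeholder credentials
df_unique = get_unique(df)  # keeps the first row for each URL (drop_duplicates)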
