Skip to content

Commit

Permalink
Replaced non-greedy regex match with simple startswith/endswith checks
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Sep 2, 2024
1 parent 535e45c commit 55a224c
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.2.2',
version='1.2.3',
author='Emmett McFarlane',
author_email='[email protected]',
description='AI-native extractor, powered by multimodal LLMs.',
Expand Down
23 changes: 15 additions & 8 deletions thepipe/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
TWITTER_DOMAINS = ['https://twitter.com', 'https://www.twitter.com', 'https://x.com', 'https://www.x.com']
YOUTUBE_DOMAINS = ['https://www.youtube.com', 'https://youtube.com']
GITHUB_DOMAINS = ['https://github.com', 'https://www.github.com']
EXTRACTION_PROMPT = os.getenv("EXTRACTION_PROMPT", """An open source document is given. Output the entire extracted contents from the document in detailed markdown format.
SCRAPING_PROMPT = os.getenv("EXTRACTION_PROMPT", """An open source document is given. Output the entire extracted contents from the document in detailed markdown format.
Be sure to correctly format markdown for headers, paragraphs, lists, tables, menus, equations, full text contents, etc.
Always reply immediately with only markdown. Do not output anything else.""")
DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o-mini")
Expand Down Expand Up @@ -183,7 +183,7 @@ def process_page(page_num):
"role": "user",
"content": [
{"type": "image_url", "image_url": make_image_url(image, host_images=HOST_IMAGES)},
{"type": "text", "text": f"```{text}```\n{EXTRACTION_PROMPT}"},
{"type": "text", "text": f"```{text}```\n{SCRAPING_PROMPT}"},
]
},
]
Expand All @@ -193,10 +193,17 @@ def process_page(page_num):
temperature=0.2
)
try:
llm_response = response.choices[0].message.content
markdown_match = re.search(r"```markdown(.*?)```", llm_response, re.DOTALL)
if markdown_match:
llm_response = markdown_match.group(1).strip()
llm_response = response.choices[0].message.content.strip()

# strip the surrounding code fence (```markdown or plain ```) if one is present
if llm_response.startswith("```markdown"):
llm_response = llm_response[len("```markdown"):]
elif llm_response.startswith("```"):
llm_response = llm_response[len("```"):]
if llm_response.endswith("```"):
llm_response = llm_response[:-len("```")]
llm_response = llm_response.strip()

return page_num, llm_response, image
except Exception as e:
raise ValueError(f"{e} (unable to read LLM response: {response})")
Expand Down Expand Up @@ -291,9 +298,9 @@ def scrape_spreadsheet(file_path: str, source_type: str) -> List[Chunk]:

def ai_extract_webpage_content(url: str, text_only: Optional[bool] = False, verbose: Optional[bool] = False, ai_model: Optional[str] = DEFAULT_AI_MODEL) -> Chunk:
from playwright.sync_api import sync_playwright
import modal
from openai import OpenAI

#import modal
#app_name = "scrape-ui"
#function_name = "get_ui_layout_preds"
#fn = modal.Function.lookup(app_name, function_name)
Expand Down Expand Up @@ -347,7 +354,7 @@ def ai_extract_webpage_content(url: str, text_only: Optional[bool] = False, verb
"role": "user",
"content": [
{"type": "image_url", "image_url": make_image_url(stacked_image, host_images=HOST_IMAGES)},
{"type": "text", "text": EXTRACTION_PROMPT},
{"type": "text", "text": SCRAPING_PROMPT},
]
},
]
Expand Down

0 comments on commit 55a224c

Please sign in to comment.