Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@ miner/openai/
miner/best_miner/
output/
results/
result_processed/
.idea/
.env
weights_tracking/
48 changes: 46 additions & 2 deletions validator/snippet_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def _get_browser_headers(self, url: str = None, referer: str = None) -> dict:
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": accept_language, # Randomized
"Accept-Encoding": "gzip, deflate, br",
"Accept-Encoding": "gzip, deflate", # Removed 'br' (Brotli) - some servers may not handle it correctly
"DNT": "1", # Do Not Track
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
Expand Down Expand Up @@ -506,8 +506,52 @@ async def fetch_entire_page(
bt.logging.error(f"{request_id} | {miner_uid} | {url} | Error occurred | Returning empty html : {response}")
return ""

# Log response headers to debug compression/encoding issues
content_encoding = response.headers.get("content-encoding", "none")
content_type = response.headers.get("content-type", "none")
bt.logging.info(
f"{request_id} | {miner_uid} | {url} | "
f"Response headers - Content-Encoding: {content_encoding}, Content-Type: {content_type}"
)

# Ensure we're getting text, not binary
# httpx should auto-decompress, but let's verify
try:
html_content = response.text

# Check if response looks like binary/compressed data
if len(html_content) > 0:
# Check the first 100 characters of the decoded text to see if it looks binary
first_bytes = html_content[:100]
# If it contains a lot of non-printable characters, it might be binary
non_printable = sum(1 for c in first_bytes if ord(c) < 32 and c not in '\n\r\t\f')
if len(first_bytes) > 0 and (non_printable / len(first_bytes)) > 0.3:
bt.logging.error(
f"{request_id} | {miner_uid} | {url} | "
f"Response appears to be binary/compressed. "
f"Content-Encoding: {content_encoding}, "
f"Non-printable ratio: {non_printable/len(first_bytes):.2%}. "
f"First 50 bytes (repr): {repr(first_bytes[:50])}"
)
# Try to get raw content and manually decode if needed
try:
raw_content = response.content
# Decode as UTF-8, replacing undecodable bytes (no other encoding is tried)
html_content = raw_content.decode('utf-8', errors='replace')
bt.logging.warning(f"{request_id} | {miner_uid} | {url} | Manually decoded response as UTF-8")
except Exception as decode_error:
bt.logging.error(f"{request_id} | {miner_uid} | {url} | Failed to decode response: {decode_error}")
return ""

except Exception as e:
bt.logging.error(
f"{request_id} | {miner_uid} | {url} | "
f"Failed to get response text: {e}"
)
return ""

cleaned_html: str = await self.clean_html(
request_id, miner_uid, url, response.text
request_id, miner_uid, url, html_content
)

duration = time.perf_counter() - start
Expand Down
Loading