Some modification to keep only some selectors when loading documents and to bypass human verification from some websites

pull/21911/head
Kev744 2 weeks ago committed by GitHub
parent 242eeb537f
commit 41da354d12
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -59,7 +59,7 @@ class PlaywrightEvaluator(ABC):
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
"""Evaluate the page HTML content using the `unstructured` library."""
def __init__(self, remove_selectors: Optional[List[str]] = None):
def __init__(self, remove_selectors: Optional[List[str]] = None, keep_only_selectors: Optional[List[str]] = None):
"""Initialize UnstructuredHtmlEvaluator."""
try:
import unstructured # noqa:F401
@ -70,6 +70,7 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
)
self.remove_selectors = remove_selectors
self.keep_only_selectors = keep_only_selectors
def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
"""Synchronously process the HTML content of the page."""
@ -80,6 +81,15 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")
selectors = ','.join(self.keep_only_selectors) or 'body'
outer_html = page.evaluate("(selectors) => [...document.querySelectorAll(selectors)]"
".map(node => node.outerHTML)"
".join('')", selectors)
page.evaluate(
"""(html) => { document.querySelector('body').innerHTML = html }""",
outer_html
)
page_source = page.content()
elements = partition_html(text=page_source)
@ -97,6 +107,14 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
if await element.is_visible():
await element.evaluate("element => element.remove()")
selectors = ','.join(self.keep_only_selectors) or 'body'
outer_html = await page.evaluate("(selectors) => [...document.querySelectorAll(selectors)]"
".map(node => node.outerHTML)"
".join('')", selectors)
await page.evaluate(
"""(html) => { document.querySelector('body').innerHTML = html }""",
outer_html
)
page_source = await page.content()
elements = partition_html(text=page_source)
return "\n\n".join([str(el) for el in elements])
@ -135,8 +153,11 @@ class PlaywrightURLLoader(BaseLoader):
continue_on_failure: bool = True,
headless: bool = True,
remove_selectors: Optional[List[str]] = None,
keep_only_selectors: Optional[List[str]] = None,
evaluator: Optional[PlaywrightEvaluator] = None,
proxy: Optional[Dict[str, str]] = None,
header: Optional[str] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/58.0.3029.110 Safari/537.36')
):
"""Load a list of URLs using Playwright."""
try:
@ -151,14 +172,15 @@ class PlaywrightURLLoader(BaseLoader):
self.continue_on_failure = continue_on_failure
self.headless = headless
self.proxy = proxy
self.header = header
if remove_selectors and evaluator:
if remove_selectors or keep_only_selectors and evaluator:
raise ValueError(
"`remove_selectors` and `evaluator` cannot be both not None"
"`remove_selectors or keep_only_selectors` and `evaluator` cannot be both not None"
)
# Use the provided evaluator, if any, otherwise, use the default.
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors, keep_only_selectors)
def lazy_load(self) -> Iterator[Document]:
"""Load the specified URLs using Playwright and create Document instances.
@ -172,7 +194,8 @@ class PlaywrightURLLoader(BaseLoader):
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls:
try:
page = browser.new_page()
context = browser.new_context(user_agent=self.header)
page = context.new_page()
response = page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
@ -211,7 +234,8 @@ class PlaywrightURLLoader(BaseLoader):
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls:
try:
page = await browser.new_page()
context = await browser.new_context(user_agent=self.header)
page = await context.new_page()
response = await page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")

Loading…
Cancel
Save