Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 1 addition & 35 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions pydoll/elements/web_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Optional

import aiofiles
from bs4 import BeautifulSoup

from pydoll.commands import (
DomCommands,
Expand Down Expand Up @@ -34,7 +33,10 @@
from pydoll.protocol.dom.types import Quad
from pydoll.protocol.page.responses import CaptureScreenshotResponse
from pydoll.protocol.page.types import Viewport
from pydoll.utils import decode_base64_to_bytes
from pydoll.utils import (
decode_base64_to_bytes,
extract_text_from_html,
)


class WebElement(FindElementsMixin): # noqa: PLR0904
Expand Down Expand Up @@ -99,8 +101,7 @@ def is_enabled(self) -> bool:
async def text(self) -> str:
    """
    Visible text content of the element.

    Awaits the element's ``inner_html`` and passes it to
    ``extract_text_from_html`` with ``strip=True``, so each text fragment is
    stripped of leading/trailing whitespace before the fragments are joined.

    Returns:
        str: The extracted text.
    """
    # Only the stdlib-based extractor is used here; the former BeautifulSoup
    # path returned first and made this call unreachable.
    inner_html = await self.inner_html
    return extract_text_from_html(inner_html, strip=True)

@property
async def bounds(self) -> Quad:
Expand Down
90 changes: 90 additions & 0 deletions pydoll/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
import os
import re
from html import unescape
from html.parser import HTMLParser

import aiohttp

Expand All @@ -10,6 +12,94 @@
logger = logging.getLogger(__name__)


class TextExtractor(HTMLParser):
    """
    HTML parser for text extraction.

    Collects the text nodes of an HTML string, excluding everything nested
    inside the tags listed in ``_skip_tags``.
    """

    def __init__(self):
        # convert_charrefs=True (the HTMLParser default, made explicit here)
        # means handle_data() already receives text with character references
        # decoded, so no additional unescaping must be applied to it.
        super().__init__(convert_charrefs=True)
        self._parts = []
        self._skip_tags = {"script", "style", "template"}
        # Number of skip tags currently open. A counter (rather than a flag)
        # keeps skipping active when skip tags are nested, e.g. a <style>
        # inside a <template>.
        self._skip_depth = 0

    @property
    def _skip(self):
        """Whether the parser is currently inside at least one skip tag."""
        return self._skip_depth > 0

    def handle_starttag(self, tag, attrs):
        """
        Marks the parser to skip content inside tags specified in _skip_tags.

        Args:
            tag (str): The tag name.
            attrs (list): A list of (attribute, value) pairs.
        """
        if tag in self._skip_tags:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        """
        Leaves one level of skipped content when a skip tag is closed.

        Args:
            tag (str): The tag name.
        """
        # Guard against stray closing tags so the depth never goes negative.
        if tag in self._skip_tags and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        """
        Handles text nodes. Adds them to the result unless they are within a skip tag.

        Args:
            data (str): The text data (character references already decoded).
        """
        if self._skip_depth == 0:
            self._parts.append(data)

    def get_strings(self, strip: bool):
        """
        Yields all collected visible text fragments.

        Args:
            strip (bool): Whether to strip leading/trailing whitespace from each
                fragment. When True, fragments that are empty after stripping
                are omitted so they do not produce stray separators.

        Yields:
            str: Visible text fragments.
        """
        for text in self._parts:
            if not strip:
                yield text
                continue
            stripped = text.strip()
            if stripped:
                yield stripped

    def get_text(self, separator: str, strip: bool) -> str:
        """
        Returns all visible text.

        Args:
            separator (str): String inserted between extracted text fragments.
            strip (bool): Whether to strip whitespace from each fragment.

        Returns:
            str: The visible text.
        """
        return separator.join(self.get_strings(strip=strip))


def extract_text_from_html(html: str, separator: str = '', strip: bool = False) -> str:
    """
    Extracts visible text content from an HTML string.

    The HTML is parsed with ``TextExtractor`` and the collected text fragments
    are joined with ``separator``.

    Args:
        html (str): The HTML string to extract text from.
        separator (str, optional): String inserted between extracted text fragments. Defaults to ''.
        strip (bool, optional): Whether to strip whitespace from text fragments. Defaults to False.

    Returns:
        str: The extracted visible text.
    """
    parser = TextExtractor()
    parser.feed(html)
    # feed() alone can hold back trailing text (for example input ending in
    # text with an unterminated '&', such as "AT&T") while waiting for more
    # data; close() forces the buffered remainder to be processed.
    parser.close()
    return parser.get_text(separator=separator, strip=strip)


def decode_base64_to_bytes(image: str) -> bytes:
"""
Decodes a base64 image string to bytes.
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ python = "^3.10"
websockets = "^13.1"
aiohttp = "^3.9.5"
aiofiles = "^23.2.1"
beautifulsoup4 = "^4.12.3"
mkdocstrings = "^0.29.1"


Expand Down
24 changes: 24 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
has_return_outside_function,
is_script_already_function,
validate_browser_paths,
extract_text_from_html,
)


Expand Down Expand Up @@ -401,3 +402,26 @@ def test_has_return_outside_function_arrow_function(self):
'''
assert has_return_outside_function(script) is False

def test_extract_text_without_strip_without_separator(self):
    """Unstripped fragments are concatenated verbatim when no separator is given."""
    html = ('<div>Hello <span> world </span><script>alert(1)</script><style>body { color: red; }</style>'
            '<template>hidden</template></div>')
    result = extract_text_from_html(html)
    # The fragments are 'Hello ' and ' world ', so joining them with an empty
    # separator keeps both spaces between the two words.
    assert result == 'Hello ' + ' world '

def test_extract_text_with_strip_without_separator(self):
    """Stripped fragments are concatenated directly when no separator is given."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script>'
        '<style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    assert extract_text_from_html(markup, strip=True) == 'Helloworld'

def test_extract_text_without_strip_with_separator(self):
    """Unstripped fragments keep their whitespace and are joined by the separator."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script>'
        '<style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    extracted = extract_text_from_html(markup, separator="/")
    expected = 'Hello / world '
    assert extracted == expected

def test_extract_text_with_strip_with_separator(self):
    """Stripped fragments are joined by the separator."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script>'
        '<style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    extracted = extract_text_from_html(markup, strip=True, separator="/")
    expected = 'Hello/world'
    assert extracted == expected
Loading