Usage Guide

Installation

Install the package using pip:

pip install scrapery

HTML Example

from scrapery import *

html_content = """
<html>
    <body>
        <h1>Welcome</h1>
        <p>Hello<br>World</p>
        <a href="/about">About Us</a>
        <img src="/images/logo.png">
        <table>
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>John</td><td>30</td></tr>
            <tr><td>Jane</td><td>25</td></tr>
        </table>
    </body>
</html>
"""

# Parse HTML content
html_doc = parse_html(html_content)

# Pretty print HTML
print(prettify(html_doc))

# Get all table rows
rows = select_all(html_doc, "table tr")
for row in rows:
    print(selector_content(row))

# Get first paragraph
paragraph = select_one(html_doc, "p")
print("Paragraph:", selector_content(paragraph))

# CSS and XPath Selectors
print(selector_content(html_doc, selector="h1"))        # CSS
print(selector_content(html_doc, selector="//h1"))      # XPath
print(selector_content(html_doc, selector="a", attr="href"))  # CSS attr
print(selector_content(html_doc, selector="//a", attr="href"))  # XPath attr

# Get specific table cells
print(selector_content(html_doc, selector="td"))          # First <td>
print(selector_content(html_doc, selector="//td[2]"))     # Second <td> within its row (first match)
print(selector_content(html_doc, selector="//tr[3]/td[2]"))  # Jane's age

# Full text content
print(selector_content(html_doc))

# Root attribute
print(selector_content(html_doc, attr="lang"))

Embedded Data

html_content = """
<html>
<head>
  <script>
    window.__INITIAL_STATE__ = {
      "user": {"id": 1, "name": "Alice"},
      "isLoggedIn": true
    };
  </script>
</head>
</html>
"""

json_data = embedded_json(page_source=html_content, start_keyword="window.__INITIAL_STATE__ =")
print(json_data)

html_with_ldjson = """
<html>
  <head>
    <script type="application/ld+json">
      {
        "@context": "http://schema.org",
        "@type": "Person",
        "name": "Alice"
      }
    </script>
  </head>
</html>
"""

ld_json = embedded_json(page_source=html_with_ldjson, selector = "[type*='application/ld+json']")
print(ld_json)

DOM Navigation

p_elem = select_one(html_doc, "p")
print("Parent:", parent(p_elem).tag)
print("Children:", [c.tag for c in children(p_elem)])
print("Siblings:", [s.tag for s in siblings(p_elem)])

print("Next sibling of <p>:", next_sibling(p_elem).tag)
h1_elem = select_one(html_doc, "h1")
print("Next sibling of <h1>:", next_sibling(h1_elem))

ancs = ancestors(p_elem)
print("Ancestors:", [a.tag for a in ancs])
desc = descendants(select_one(html_doc, "table"))
print("Descendants:", [d.tag for d in desc])

Class Utilities

div_html = '<div class="card primary"></div>'
div_elem = parse_html(div_html)
print("Has class 'card'? ->", has_class(div_elem, "card"))
print("Classes:", get_classes(div_elem))

Resolve Relative URLs

base = "https://example.com"
print(absolute_url(html_doc, "a", base_url=base))
print(absolute_url(html_doc, "img", base_url=base, attr="src"))

XML Example

xml_content = "<root><child>Test</child></root>"
xml_doc = parse_xml(xml_content)
print(prettify(xml_doc))

all_elements = select_all(xml_doc, "child")
print(all_elements)

child = select_one(xml_doc, "//child")
print(child)

print(selector_content(xml_doc, "child"))
print(parent(child))
print(children(xml_doc))

print(xml_find(xml_doc, "child"))
print(xml_find_all(xml_doc, "child"))

print(xml_xpath(xml_doc, "//child"))

xslt = """<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
    <xsl:template match="/">
        <html><body><xsl:value-of select="/root/child"/></body></html>
    </xsl:template>
</xsl:stylesheet>"""

transformed = xml_transform(xml_doc, xslt)
print(prettify(transformed))

# Validate against an XSD schema (requires a schema.xsd file on disk)
from pathlib import Path

is_valid = xml_validate_xsd(xml_doc, Path("schema.xsd"))
print(is_valid)

new_element = xml_create_element("newTag", text="This is new", id="123")
xml_add_child(xml_doc, new_element)
xml_set_attr(new_element, "id", "456")
print(prettify(xml_doc))

JSON Example

json_str = '{"user": {"profile": {"name": "Alice"}}}'
print(json_content(json_str, keys=["name"], position="first"))
print(json_content(json_str, keys=["user", "profile", "name"], position="last"))

Useful Utilities

Create a Directory

from scrapery import create_directory

create_directory("new_folder")
create_directory("parent_folder/sub_folder")

Standardize a String

from scrapery import standardized_string
# This function standardizes the input string by removing escape sequences like \n, \t, and \r, removing HTML tags, collapsing multiple spaces, and trimming leading/trailing spaces.

# Example 1: Standardize a string with newlines, tabs, and HTML tags
input_string_1 = "<html><body>  Hello \nWorld!  \tThis is a test.  </body></html>"
print("Standardized String 1:", standardized_string(input_string_1))

# Example 2: Input string with multiple spaces and line breaks
input_string_2 = "  This   is   a  \n\n   string   with  spaces and \t tabs.  "
print("Standardized String 2:", standardized_string(input_string_2))

# Example 3: Pass an empty string
input_string_3 = ""
print("Standardized String 3:", standardized_string(input_string_3))

# Example 4: Pass None (invalid input)
input_string_4 = None
print("Standardized String 4:", standardized_string(input_string_4))

Replace a String

from scrapery import replace_content

text = "posting posting posting"

# Example 1: Replace all occurrences
result = replace_content(text, "posting", "UPDATED")
print(result)
# Output: "UPDATED UPDATED UPDATED"

# Example 2: Replace only the 2nd occurrence (position)
result = replace_content(text, "posting", "UPDATED", position=2)
print(result)
# Output: "posting UPDATED posting"

# Example 3: Case-insensitive replacement
text = "Posting POSTING posting"
result = replace_content(text, "posting", "edited", ignore_case=True, position=2)
print(result)
# Output: "Posting edited posting"

# Example 4: Limit number of replacements (count)
text = "apple apple apple"
result = replace_content(text, "apple", "orange", count=2)
print(result)
# Output: "orange orange apple"

# Example 5: Replace in a file

# example.txt contains: "error error error"
replace_content("example.txt", "error", "warning", ignore_case=True)
# The file now contains: "warning warning warning"

Read CSV

from scrapery import read_csv

result = read_csv('data.csv', 'URL', 'Category', ['Tech'])
print(result)

Save to CSV

from scrapery import save_to_csv

list_data = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]
headers = ['ID', 'Name', 'Age']
output_file_path = 'output.csv'
save_to_csv(list_data, headers, output_file_path)

Save to Excel

from scrapery import save_to_xls

save_to_xls(list_data, headers, 'output.xlsx')

Save to SQLite Database

from scrapery import save_to_db

# Reuses list_data and headers from the "Save to CSV" example above.
# Creates a SQLite database file named data.sqlite in the current folder and adds a table called data.
save_to_db(list_data, headers)

# Creates a SQLite database file named mydb.sqlite in the given folder (report) and adds a table called User.
save_to_db(list_data, headers, auto_data_type=False, output_file_path="report/mydb.sqlite", table_name="User")

List Files in a Directory

from scrapery import list_files

files = list_files(directory="output", extension="csv")
print(files)

Read File Content

from scrapery import read_file_content

small_json = read_file_content("small.json", stream_json=False)
print(small_json)

stream = read_file_content("large.json", stream_json=True)
for obj in stream:
    print(obj)

text = read_file_content("large_text.txt")
print(text[:500])

Save to File

from scrapery import save_file_content

save_file_content("output.txt", "Hello World")
save_file_content("data.json", {"name": "Alice"})
save_file_content("append.txt", "\nAnother line", mode="a")

Send Email

from pathlib import Path

from scrapery import send_email

smtp_server = "smtp.gmail.com"  # For Gmail, change if using other services
sender_email = "your_email@gmail.com"  # Replace with the sender's email address
sender_passwd = "your_email_password"  # Replace with the sender's email password (consider using OAuth for security)
to_addrs = ["recipient1@example.com", "recipient2@example.com"]  # List of recipient email addresses
subject = "Test Email with Attachments"
smtp_port = 465  # SMTP port for Gmail SSL
text_body = "Hello, this is a test email."
html_body = "<html><body><h1>Hello, this is a <i>test</i> email.</h1></body></html>"
cc_addrs = ["cc_recipient@example.com"]  # Optional: list of CC recipients
bcc_addrs = ["bcc_recipient@example.com"]  # Optional: list of BCC recipients
attachments = [Path("/path/to/file1.pdf"), Path("/path/to/image.png")]  # Optional: list of file paths to attach

# Call the send_email function
success, message = send_email(
    smtp_server=smtp_server,
    sender_email=sender_email,
    sender_passwd=sender_passwd,
    to_addrs=to_addrs,
    subject=subject,
    smtp_port=smtp_port,
    text_body=text_body,
    html_body=html_body,
    cc_addrs=cc_addrs,
    bcc_addrs=bcc_addrs,
    attachments=attachments
)

# Print the result
print(f"Success: {success}")
print(f"Message: {message}")