Usage Guide
Installation
Install the package using pip:
pip install scrapery
HTML Example
from scrapery import *
html_content = """
<html>
<body>
<h1>Welcome</h1>
<p>Hello<br>World</p>
<a href="/about">About Us</a>
<img src="/images/logo.png">
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>30</td></tr>
<tr><td>Jane</td><td>25</td></tr>
</table>
</body>
</html>
"""
# Parse HTML content
html_doc = parse_html(html_content)
# Pretty print HTML
print(prettify(html_doc))
# Get all table rows
rows = select_all(html_doc, "table tr")
for row in rows:
print(selector_content(row))
# Get first paragraph
paragraph = select_one(html_doc, "p")
print("Paragraph:", selector_content(paragraph))
# CSS and XPath Selectors
print(selector_content(html_doc, selector="h1")) # CSS
print(selector_content(html_doc, selector="//h1")) # XPath
print(selector_content(html_doc, selector="a", attr="href")) # CSS attr
print(selector_content(html_doc, selector="//a", attr="href")) # XPath attr
# Get specific table cells
print(selector_content(html_doc, selector="td")) # First <td>
print(selector_content(html_doc, selector="//td[2]")) # Second <td> within its row
print(selector_content(html_doc, selector="//tr[3]/td[2]")) # Jane's age
# Full text content
print(selector_content(html_doc))
# Root attribute
print(selector_content(html_doc, attr="lang"))
Embedded Data
html_content = """
<html>
<head>
<script>
window.__INITIAL_STATE__ = {
"user": {"id": 1, "name": "Alice"},
"isLoggedIn": true
};
</script>
</head>
</html>
"""
json_data = embedded_json(page_source=html_content, start_keyword="window.__INITIAL_STATE__ =")
print(json_data)
html_with_ldjson = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": "Person",
"name": "Alice"
}
</script>
</head>
</html>
"""
ld_json = embedded_json(page_source=html_with_ldjson, selector = "[type*='application/ld+json']")
print(ld_json)
Class Utilities
div_html = '<div class="card primary"></div>'
div_elem = parse_html(div_html)
print("Has class 'card'? ->", has_class(div_elem, "card"))
print("Classes:", get_classes(div_elem))
Resolve Relative URLs
base = "https://example.com"
print(absolute_url(html_doc, "a", base_url=base))
print(absolute_url(html_doc, "img", base_url=base, attr="src"))
XML Example
xml_content = "<root><child>Test</child></root>"
xml_doc = parse_xml(xml_content)
print(prettify(xml_doc))
all_elements = select_all(xml_doc, "child")
print(all_elements)
child = select_one(xml_doc, "//child")
print(child)
print(selector_content(xml_doc, "child"))
print(parent(child))
print(children(xml_doc))
print(xml_find(xml_doc, "child"))
print(xml_find_all(xml_doc, "child"))
print(xml_xpath(xml_doc, "//child"))
xslt = """<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:template match="/">
<html><body><xsl:value-of select="/root/child"/></body></html>
</xsl:template>
</xsl:stylesheet>"""
transformed = xml_transform(xml_doc, xslt)
print(prettify(transformed))
# Validate (requires schema file)
is_valid = xml_validate_xsd(xml_doc, Path("schema.xsd"))
print(is_valid)
new_element = xml_create_element("newTag", text="This is new", id="123")
xml_add_child(xml_doc, new_element)
xml_set_attr(new_element, "id", "456")
print(prettify(xml_doc))
JSON Example
json_str = '{"user": {"profile": {"name": "Alice"}}}'
print(json_content(json_str, keys=["name"], position="first"))
print(json_content(json_str, keys=["user", "profile", "name"], position="last"))
Useful Utilities
Create a Directory
from scrapery import create_directory
create_directory("new_folder")
create_directory("parent_folder/sub_folder")
Standardize a String
from scrapery import standardized_string
# This function standardizes the input string by removing escape sequences like \n, \t, and \r, removing HTML tags, collapsing multiple spaces, and trimming leading/trailing spaces.
# Example 1: Standardize a string with newlines, tabs, and HTML tags
input_string_1 = "<html><body> Hello \nWorld! \tThis is a test. </body></html>"
print("Standardized String 1:", standardized_string(input_string_1))
# Example 2: Input string with multiple spaces and line breaks
input_string_2 = " This is a \n\n string with spaces and \t tabs. "
print("Standardized String 2:", standardized_string(input_string_2))
# Example 3: Pass an empty string
input_string_3 = ""
print("Standardized String 3:", standardized_string(input_string_3))
# Example 4: Pass None (invalid input)
input_string_4 = None
print("Standardized String 4:", standardized_string(input_string_4))
Replace a String
from scrapery import replace_content
text = "posting posting posting"
# Example 1: Replace all occurrences
result = replace_content(text, "posting", "UPDATED")
print(result)
# Output: "UPDATED UPDATED UPDATED"
# Example 2: Replace only the 2nd occurrence (position)
result = replace_content(text, "posting", "UPDATED", position=2)
print(result)
# Output: "posting UPDATED posting"
# Example 3: Case-insensitive replacement
text = "Posting POSTING posting"
result = replace_content(text, "posting", "edited", ignore_case=True, position=2)
print(result)
# Output: "Posting edited posting"
# Example 4: Limit number of replacements (count)
text = "apple apple apple"
result = replace_content(text, "apple", "orange", count=2)
print(result)
# Output: "orange orange apple"
# Example 5: Replace in a file
# example.txt contains: "error error error"
replace_content("example.txt", "error", "warning", ignore_case=True)
# The file now contains: "warning warning warning"
Read CSV
from scrapery import read_csv
result = read_csv('data.csv', 'URL', 'Category', ['Tech'])
print(result)
Save to CSV
from scrapery import save_to_csv
list_data = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]
headers = ['ID', 'Name', 'Age']
output_file_path = 'output.csv'
save_to_csv(list_data, headers, output_file_path)
Save to Excel
from scrapery import save_to_xls
save_to_xls(list_data, headers, 'output.xlsx')
Save to SQLite Database
from scrapery import save_to_db
# Creates a SQLite database file named data.sqlite in the current folder and adds a table called data.
# list_data and headers are the same values defined in the "Save to CSV" example above.
save_to_db(list_data, headers)
# Creates a SQLite database file named mydb.sqlite in the given folder (report) and adds a table called User.
save_to_db(list_data, headers, auto_data_type=False, output_file_path="report/mydb.sqlite", table_name="User")
List Files in a Directory
from scrapery import list_files
files = list_files(directory="output", extension="csv")
print(files)
Read File Content
from scrapery import read_file_content
small_json = read_file_content("small.json", stream_json=False)
print(small_json)
stream = read_file_content("large.json", stream_json=True)
for obj in stream:
print(obj)
text = read_file_content("large_text.txt")
print(text[:500])
Save to File
from scrapery import save_file_content
save_file_content("output.txt", "Hello World")
save_file_content("data.json", {"name": "Alice"})
save_file_content("append.txt", "\nAnother line", mode="a")
Send Email
from pathlib import Path
from scrapery import send_email
smtp_server = "smtp.gmail.com" # Gmail's SMTP server; change this if you use another provider
sender_email = "your_email@gmail.com" # Replace with the sender's email address
sender_passwd = "your_email_password" # Replace with the sender's email password (consider using OAuth for security)
to_addrs = ["recipient1@example.com", "recipient2@example.com"] # List of recipient email addresses
subject = "Test Email with Attachments"
smtp_port = 465 # SMTP port for Gmail SSL
text_body = "Hello, this is a test email."
html_body = "<html><body><h1>Hello, this is a <i>test</i> email.</h1></body></html>"
cc_addrs = ["cc_recipient@example.com"] # Optional: list of CC recipients
bcc_addrs = ["bcc_recipient@example.com"] # Optional: list of BCC recipients
attachments = [Path("/path/to/file1.pdf"), Path("/path/to/image.png")] # Optional: list of file paths to attach
# Call the send_email function
success, message = send_email(
smtp_server=smtp_server,
sender_email=sender_email,
sender_passwd=sender_passwd,
to_addrs=to_addrs,
subject=subject,
smtp_port=smtp_port,
text_body=text_body,
html_body=html_body,
cc_addrs=cc_addrs,
bcc_addrs=bcc_addrs,
attachments=attachments
)
# Print the result
print(f"Success: {success}")
print(f"Message: {message}")