Usage Guide
===========
Installation
------------
Install the package using pip:

.. code-block:: bash

   pip install scrapery

HTML Example
------------
.. code-block:: python
from scrapery import *
html_content = """
Welcome
Hello
World
About Us
"""
# Parse HTML content
html_doc = parse_html(html_content)
# Pretty print HTML
print(prettify(html_doc))
# Get all table rows
rows = select_all(html_doc, "table tr")
for row in rows:
    print(selector_content(row))
# Get first paragraph
paragraph = select_one(html_doc, "p")
print("Paragraph:", selector_content(paragraph))
# CSS and XPath Selectors
print(selector_content(html_doc, selector="h1")) # CSS
print(selector_content(html_doc, selector="//h1")) # XPath
print(selector_content(html_doc, selector="a", attr="href")) # CSS attr
print(selector_content(html_doc, selector="//a", attr="href")) # XPath attr
# Get specific table cells
print(selector_content(html_doc, selector="td")) # First cell
print(selector_content(html_doc, selector="//td[2]")) # Second cell
print(selector_content(html_doc, selector="//tr[3]/td[2]")) # Jane's age
# Full text content
print(selector_content(html_doc))
# Root attribute
print(selector_content(html_doc, attr="lang"))
Embedded Data
-------------
.. code-block:: python
html_content = """
"""
json_data = embedded_json(page_source=html_content, start_keyword="window.__INITIAL_STATE__ =")
print(json_data)
.. code-block:: python
html_with_ldjson = """
"""
ld_json = embedded_json(page_source=html_with_ldjson, selector = "[type*='application/ld+json']")
print(ld_json)
DOM Navigation
--------------
.. code-block:: python
p_elem = select_one(html_doc, "p")
print("Parent:", parent(p_elem).tag)
print("Children:", [c.tag for c in children(p_elem)])
print("Siblings:", [s.tag for s in siblings(p_elem)])
print("Next sibling of <p>:", next_sibling(p_elem).tag)
h1_elem = select_one(html_doc, "h1")
print("Next sibling of <h1>:", next_sibling(h1_elem))
ancs = ancestors(p_elem)
print("Ancestors:", [a.tag for a in ancs])
desc = descendants(select_one(html_doc, "table"))
print("Descendants:", [d.tag for d in desc])
Class Utilities
---------------
.. code-block:: python
div_html = ''
div_elem = parse_html(div_html)
print("Has class 'card'? ->", has_class(div_elem, "card"))
print("Classes:", get_classes(div_elem))
Resolve Relative URLs
----------------------
.. code-block:: python
base = "https://example.com"
print(absolute_url(html_doc, "a", base_url=base))
print(absolute_url(html_doc, "img", base_url=base, attr="src"))
XML Example
-----------
.. code-block:: python
xml_content = "Test"
xml_doc = parse_xml(xml_content)
print(prettify(xml_doc))
all_elements = select_all(xml_doc, "child")
print(all_elements)
child = select_one(xml_doc, "//child")
print(child)
print(selector_content(xml_doc, "child"))
print(parent(child))
print(children(xml_doc))
print(xml_find(xml_doc, "child"))
print(xml_find_all(xml_doc, "child"))
print(xml_xpath(xml_doc, "//child"))
xslt = """
"""
transformed = xml_transform(xml_doc, xslt)
print(prettify(transformed))
# Validate (requires schema file)
from pathlib import Path
is_valid = xml_validate_xsd(xml_doc, Path("schema.xsd"))
print(is_valid)
new_element = xml_create_element("newTag", text="This is new", id="123")
xml_add_child(xml_doc, new_element)
xml_set_attr(new_element, "id", "456")
print(prettify(xml_doc))
JSON Example
------------
.. code-block:: python
json_str = '{"user": {"profile": {"name": "Alice"}}}'
print(json_content(json_str, keys=["name"], position="first"))
print(json_content(json_str, keys=["user", "profile", "name"], position="last"))
Useful Utilities
----------------
Create a Directory
------------------
.. code-block:: python
from scrapery import create_directory
create_directory("new_folder")
create_directory("parent_folder/sub_folder")
Standardize a String
--------------------
.. code-block:: python
from scrapery import standardized_string
# This function standardizes the input string by removing escape sequences like \n, \t, and \r, removing HTML tags, collapsing multiple spaces, and trimming leading/trailing spaces.
# Example 1: Standardize a string with newlines, tabs, and HTML tags
input_string_1 = " Hello \nWorld! \tThis is a test. "
print("Standardized String 1:", standardized_string(input_string_1))
# Example 2: Input string with multiple spaces and line breaks
input_string_2 = " This is a \n\n string with spaces and \t tabs. "
print("Standardized String 2:", standardized_string(input_string_2))
# Example 3: Pass an empty string
input_string_3 = ""
print("Standardized String 3:", standardized_string(input_string_3))
# Example 4: Pass None (invalid input)
input_string_4 = None
print("Standardized String 4:", standardized_string(input_string_4))
Replace a String
----------------
.. code-block:: python
from scrapery import replace_content
text = "posting posting posting"
# Example 1: Replace all occurrences
result = replace_content(text, "posting", "UPDATED")
print(result)
# Output: "UPDATED UPDATED UPDATED"
# Example 2: Replace only the 2nd occurrence (position)
result = replace_content(text, "posting", "UPDATED", position=2)
print(result)
# Output: "posting UPDATED posting"
# Example 3: Case-insensitive replacement
text = "Posting POSTING posting"
result = replace_content(text, "posting", "edited", ignore_case=True, position=2)
print(result)
# Output: "Posting edited posting"
# Example 4: Limit number of replacements (count)
text = "apple apple apple"
result = replace_content(text, "apple", "orange", count=2)
print(result)
# Output: "orange orange apple"
# Example 5: Replace in a file
# example.txt contains: "error error error"
replace_content("example.txt", "error", "warning", ignore_case=True)
# The file now contains: "warning warning warning"
Read CSV
--------
.. code-block:: python
from scrapery import read_csv
result = read_csv('data.csv', 'URL', 'Category', ['Tech'])
print(result)
Save to CSV
-----------
.. code-block:: python
from scrapery import save_to_csv
list_data = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]
headers = ['ID', 'Name', 'Age']
output_file_path = 'output.csv'
save_to_csv(list_data, headers, output_file_path)
Save to Excel
-------------
.. code-block:: python
from scrapery import save_to_xls
save_to_xls(list_data, headers, 'output.xlsx')
Save to SQLite Database
-----------------------
.. code-block:: python
from scrapery import save_to_db
# Creates a SQLite database file named data.sqlite in the current folder and adds a table called data.
save_to_db(list_data, headers)
# Creates a SQLite database file named mydb.sqlite in the given folder (report) and adds a table called User.
save_to_db(list_data, headers, auto_data_type=False, output_file_path="report/mydb.sqlite", table_name="User")
List Files in a Directory
-------------------------
.. code-block:: python
from scrapery import list_files
files = list_files(directory="output", extension="csv")
print(files)
Read File Content
-----------------
.. code-block:: python
from scrapery import read_file_content
small_json = read_file_content("small.json", stream_json=False)
print(small_json)
stream = read_file_content("large.json", stream_json=True)
for obj in stream:
    print(obj)
text = read_file_content("large_text.txt")
print(text[:500])
Save to File
------------
.. code-block:: python
from scrapery import save_file_content
save_file_content("output.txt", "Hello World")
save_file_content("data.json", {"name": "Alice"})
save_file_content("append.txt", "\nAnother line", mode="a")
Send Mail
---------
.. code-block:: python
from pathlib import Path
from scrapery import send_email
smtp_server = "smtp.gmail.com" # For Gmail, change if using other services
sender_email = "your_email@gmail.com" # Replace with the sender's email address
sender_passwd = "your_email_password" # Replace with the sender's email password (consider using OAuth for security)
to_addrs = ["recipient1@example.com", "recipient2@example.com"] # List of recipient email addresses
subject = "Test Email with Attachments"
smtp_port = 465 # SMTP port for Gmail SSL
text_body = "Hello, this is a test email."
html_body = "Hello, this is a test email."
cc_addrs = ["cc_recipient@example.com"] # Optional: list of CC recipients
bcc_addrs = ["bcc_recipient@example.com"] # Optional: list of BCC recipients
attachments = [Path("/path/to/file1.pdf"), Path("/path/to/image.png")] # Optional: list of file paths to attach
# Call the send_email function
success, message = send_email(
smtp_server=smtp_server,
sender_email=sender_email,
sender_passwd=sender_passwd,
to_addrs=to_addrs,
subject=subject,
smtp_port=smtp_port,
text_body=text_body,
html_body=html_body,
cc_addrs=cc_addrs,
bcc_addrs=bcc_addrs,
attachments=attachments
)
# Print the result
print(f"Success: {success}")
print(f"Message: {message}")
|