|
| 1 | +BeautifulSoup Cheatsheet |
| 2 | +BeautifulSoup is a Python library for parsing HTML and XML documents. It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping. |
| 3 | + |
| 4 | +1. Installation |
| 5 | +If you don't have BeautifulSoup installed, you can install it using pip: |
| 6 | + |
| 7 | +pip install beautifulsoup4 |
| 8 | +pip install lxml # Optional, but recommended for faster parsing |
| 9 | + |
| 10 | +2. Basic Parsing |
| 11 | +To get started, you need to import BeautifulSoup and provide it with the HTML/XML content. |
| 12 | + |
| 13 | +from bs4 import BeautifulSoup |
| 14 | + |
| 15 | +# Sample HTML content |
| 16 | +html_doc = """ |
| 17 | +<html><head><title>My Document</title></head> |
| 18 | +<body> |
| 19 | + <p class="title"><b>The Document Title</b></p> |
| 20 | + |
| 21 | + <a href="http://example.com/one" class="external-link" id="link1">Link One</a> |
| 22 | + <a href="http://example.com/two" class="external-link" id="link2">Link Two</a> |
| 23 | + <p>This is some other content.</p> |
| 24 | + <div class="container"> |
| 25 | + <ul> |
| 26 | + <li>Item 1</li> |
| 27 | + <li>Item 2</li> |
| 28 | + </ul> |
| 29 | + </div> |
| 30 | +</body> |
| 31 | +</html> |
| 32 | +""" |
| 33 | + |
| 34 | +# Create a BeautifulSoup object |
| 35 | +# 'lxml' is a common and fast parser; 'html.parser' is built-in |
| 36 | +soup = BeautifulSoup(html_doc, 'lxml') |
| 37 | + |
| 38 | +# Pretty-print the parsed HTML |
| 39 | +print(soup.prettify()) |
| 40 | + |
| 41 | +3. Navigating the Parse Tree |
| 42 | +BeautifulSoup allows you to navigate the parsed document using object-oriented access. |
| 43 | + |
| 44 | +Accessing Tags |
| 45 | +You can access tags directly as attributes of the BeautifulSoup object or other tags. |
| 46 | + |
| 47 | +# Get the first <head> tag |
| 48 | +head_tag = soup.head |
| 49 | +print(f"Head Tag: {head_tag}") |
| 50 | + |
| 51 | +# Get the first <title> tag in the document (here it sits inside <head>) |
| 52 | +title_tag = soup.title |
| 53 | +print(f"Title Tag: {title_tag}") |
| 54 | + |
| 55 | +# Get the tag's name |
| 56 | +print(f"Title Tag Name: {title_tag.name}") |
| 57 | + |
| 58 | +# Get the string content of the tag |
| 59 | +print(f"Title Tag Content: {title_tag.string}") |
| 60 | + |
| 61 | +# Accessing attributes of a tag |
| 62 | +link_one = soup.a |
| 63 | +print(f"Link One href: {link_one['href']}") |
| 64 | +print(f"Link One class: {link_one['class']}") |
| 65 | + |
| 66 | +# Get all attributes as a dictionary |
| 67 | +print(f"Link One attributes: {link_one.attrs}") |
| 68 | + |
| 69 | +Navigating Down |
| 70 | +.contents: A list of the tag's direct children. |
| 71 | + |
| 72 | +.children: An iterator of the tag's direct children. |
| 73 | + |
| 74 | +.descendants: An iterator of all children, grandchildren, etc. |
| 75 | + |
| 76 | +body_tag = soup.body |
| 77 | +print("\nBody Tag Contents:") |
| 78 | +for child in body_tag.contents: |
| 79 | + if child.name: # Only print actual tags |
| 80 | + print(child.name) |
| 81 | + |
| 82 | +print("\nBody Tag Descendants (Examples):") |
| 83 | +for descendant in body_tag.descendants: |
| 84 | + if descendant.name: |
| 85 | + print(descendant.name, end=" ") |
| 86 | + if descendant.name == 'li': break # Stop after a few examples |
| 87 | + |
| 88 | +Navigating Up |
| 89 | +.parent: The direct parent of a tag. |
| 90 | + |
| 91 | +.parents: An iterator of all ancestors. |
| 92 | + |
| 93 | +# Find the parent of the first <p> tag |
| 94 | +p_tag = soup.p |
| 95 | +print(f"\nParent of <p>: {p_tag.parent.name}") |
| 96 | + |
| 97 | +# Iterate through parents of a specific link |
| 98 | +link2 = soup.find(id="link2") |
| 99 | +print(f"Parents of <a id='link2'>:") |
| 100 | +for parent in link2.parents: |
| 101 | + if parent is None: |
| 102 | + continue |
| 103 | + if parent.name: |
| 104 | + print(parent.name) |
| 105 | + |
| 106 | +Navigating Sideways |
| 107 | +.next_sibling: The next sibling after the current tag. |
| 108 | + |
| 109 | +.previous_sibling: The previous sibling before the current tag. |
| 110 | + |
| 111 | +.next_siblings: An iterator of all following siblings. |
| 112 | + |
| 113 | +.previous_siblings: An iterator of all preceding siblings. |
| 114 | + |
| 115 | +first_p_tag = soup.p |
| 116 | +print(f"\nNext sibling of first <p>: {first_p_tag.next_sibling.next_sibling.name}") # Skip newline |
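| 117 | +print(f"Previous sibling of first <p>: {first_p_tag.previous_sibling.previous_sibling.name}") # Skip newline. NOTE: the first <p> is the first element in <body>, so it has no preceding tag sibling; this chain ends on None and raises AttributeError |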
| 118 | + |
| 119 | +print("\nNext siblings of first <p> (examples):") |
| 120 | +for sibling in first_p_tag.next_siblings: |
| 121 | + if sibling.name: |
| 122 | + print(sibling.name) |
| 123 | + |
| 124 | +4. Searching the Tree (find() and find_all()) |
| 125 | +These are the most powerful methods for locating specific elements. |
| 126 | + |
| 127 | +find_all(name, attrs, recursive, string, limit) |
| 128 | +Finds every tag that matches the criteria. Returns a list-like ResultSet of the matching tags (or of matching strings when only string= is given). |
| 129 | + |
| 130 | +name: Tag name (e.g., 'a', 'p'). Can be a string, list, regular expression, or function. |
| 131 | + |
| 132 | +attrs: A dictionary of attribute values (e.g., {'class': 'external-link'}). |
| 133 | + |
| 134 | +recursive: If False, only examines direct children. Default is True. |
| 135 | + |
| 136 | +string: Searches for strings instead of tags. |
| 137 | + |
| 138 | +limit: Stop searching after a certain number of matches. |
| 139 | + |
| 140 | +# Find all <a> tags |
| 141 | +all_links = soup.find_all('a') |
| 142 | +print(f"\nAll links: {all_links}") |
| 143 | + |
| 144 | +# Find all <p> tags with class 'title' |
| 145 | +title_p = soup.find_all('p', class_='title') # 'class_' because 'class' is a Python keyword |
| 146 | +print(f"Paragraphs with class 'title': {title_p}") |
| 147 | + |
| 148 | +# Find all tags that have an 'id' attribute |
| 149 | +tags_with_id = soup.find_all(id=True) |
| 150 | +print(f"Tags with an 'id' attribute: {tags_with_id}") |
| 151 | + |
| 152 | +# Find all <li> tags |
| 153 | +list_items = soup.find_all('li') |
| 154 | +for item in list_items: |
| 155 | + print(f"List Item: {item.string}") |
| 156 | + |
| 157 | +# Find text nodes that exactly match the given text (using 'string'); this returns the strings themselves, not the enclosing tags |
| 158 | +p_with_content = soup.find_all(string="This is some other content.") |
| 159 | +print(f"Tags with specific string content: {p_with_content}") |
| 160 | + |
| 161 | +find(name, attrs, recursive, string) |
| 162 | +Similar to find_all(), but returns only the first match, or None if nothing matches. |
| 163 | + |
| 164 | +# Find the first <a> tag |
| 165 | +first_link = soup.find('a') |
| 166 | +print(f"\nFirst link found: {first_link}") |
| 167 | + |
| 168 | +# Find the first <p> tag with class 'title' |
| 169 | +first_title_p = soup.find('p', class_='title') |
| 170 | +print(f"First paragraph with class 'title': {first_title_p}") |
| 171 | + |
| 172 | +Common Search Patterns |
| 173 | +# By tag name |
| 174 | +print(f"\nFind all 'p' tags: {soup.find_all('p')}") |
| 175 | + |
| 176 | +# By CSS class (note the underscore!) |
| 177 | +print(f"Find all 'a' tags with class 'external-link': {soup.find_all('a', class_='external-link')}") |
| 178 | + |
| 179 | +# By ID |
| 180 | +print(f"Find tag with id 'link1': {soup.find(id='link1')}") |
| 181 | + |
| 182 | +# By attribute value (any attribute) |
| 183 | +print(f"Find all tags with href='http://example.com/one': {soup.find_all(href='http://example.com/one')}") |
| 184 | + |
| 185 | +# Using a list of tag names |
| 186 | +print(f"Find all 'p' or 'a' tags: {soup.find_all(['p', 'a'])}") |
| 187 | + |
| 188 | +# Using regular expressions (note: reusing double quotes inside a double-quoted f-string, as below, requires Python 3.12+) |
| 189 | +import re |
| 190 | +print(f"Find all tags whose name starts with 'b': {soup.find_all(re.compile("^b"))}") # e.g., <body>, <b> |
| 191 | +print(f"Find all tags with 'link' in their ID: {soup.find_all(id=re.compile("link"))}") |
| 192 | + |
| 193 | +5. Modifying the Tree |
| 194 | +BeautifulSoup allows you to modify the parse tree. |
| 195 | + |
| 196 | +# Sample HTML for modification |
| 197 | +html_mod = """ |
| 198 | +<html> |
| 199 | +<body> |
| 200 | + <p>Original text.</p> |
| 201 | + <div id="target">Content here</div> |
| 202 | +</body> |
| 203 | +</html> |
| 204 | +""" |
| 205 | +soup_mod = BeautifulSoup(html_mod, 'lxml') |
| 206 | + |
| 207 | +# Change tag name |
| 208 | +p_tag_mod = soup_mod.p |
| 209 | +p_tag_mod.name = "div" |
| 210 | +print(f"\nAfter changing p to div: {soup_mod.prettify()}") |
| 211 | + |
| 212 | +# Modify tag attributes |
| 213 | +div_tag_mod = soup_mod.find(id="target") |
| 214 | +div_tag_mod['class'] = 'new-class' |
| 215 | +div_tag_mod['data-type'] = 'example' |
| 216 | +print(f"After modifying attributes: {soup_mod.prettify()}") |
| 217 | + |
| 218 | +# Add new content |
| 219 | +new_tag = soup_mod.new_tag("span") |
| 220 | +new_tag.string = "Added span text." |
| 221 | +div_tag_mod.append(new_tag) # Add inside the div |
| 222 | +print(f"After appending a new tag: {soup_mod.prettify()}") |
| 223 | + |
| 224 | +# Replace content (assigning .string replaces ALL children, so the <span> appended above is discarded) |
| 225 | +div_tag_mod.string = "New replaced content." |
| 226 | +print(f"After replacing div content: {soup_mod.prettify()}") |
| 227 | + |
| 228 | +# Remove content |
| 229 | +span_to_remove = soup_mod.find('span') |
| 230 | +if span_to_remove: |
| 231 | + span_to_remove.decompose() # Removes the tag and its contents |
| 232 | +print(f"After removing span: {soup_mod.prettify()}") |
| 233 | + |
| 234 | +6. CSS Selectors (select() and select_one()) |
| 235 | +BeautifulSoup also supports CSS selectors using the select() method (returns a list) and select_one() (returns the first match). |
| 236 | + |
| 237 | +# Select all <p> tags |
| 238 | +print(f"\nSelect all 'p' tags: {soup.select('p')}") |
| 239 | + |
| 240 | +# Select tags by class |
| 241 | +print(f"Select all elements with class 'external-link': {soup.select('.external-link')}") |
| 242 | + |
| 243 | +# Select tag by ID |
| 244 | +print(f"Select element with id 'link2': {soup.select('#link2')}") |
| 245 | + |
| 246 | +# Select with the child combinator (>): each step matches direct children only |
| 247 | +print(f"Select direct children <li> of .container: {soup.select('div.container > ul > li')}") |
| 248 | + |
| 249 | +# Select descendants |
| 250 | +print(f"Select all <li> descendants of .container: {soup.select('div.container li')}") |
| 251 | + |
| 252 | +# Select combined selectors |
| 253 | +print(f"Select 'p' or 'a' tags: {soup.select('p, a')}") |
| 254 | + |
| 255 | +# Select tags with specific attribute values (note: mixing both quote styles inside the f-string, as below, requires Python 3.12+) |
| 256 | +print(f"Select 'a' tags with href starting with 'http://example.com': {soup.select('a[href^="http://example.com"]')}") |
| 257 | + |
| 258 | +# Select the first matching element |
| 259 | +print(f"Select first link: {soup.select_one('a')}") |
0 commit comments