Sunday, June 13, 2021

Web scraping with Beautiful Soup

# https://www.youtube.com/watch?v=myAFVM7CxWk

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the practice page. The `with` block closes the HTTP response as
# soon as the body has been read instead of leaving the connection open.
url = "https://webscraper.io/test-sites/tables"
with urlopen(url) as response:
  html_code = response.read().decode("utf-8")
#print(html_code)

# Locate the text between the first "<h1>" and the first "</h1>" by plain
# string search. str.find returns -1 when the substring is missing, so the
# slice below is only meaningful when both tags are present.
start = html_code.find("<h1>") + len("<h1>")
end = html_code.find("</h1>")
#print(html_code[start:end])

# Parse the same markup with Beautiful Soup, using the lxml parser.
soup = BeautifulSoup(html_code, "lxml")
headings_2 = soup.find_all("h2")
#print(headings_2)

images = soup.find_all("img")
#print(images[1]["src"])
#print(images[1]["alt"])

# Collect the text of the third <td> of every row of the first table,
# skipping the first row (presumably the header row -- confirm against the
# page). find_all is used throughout; findAll is only a legacy alias.
first_table = soup.find("table")
rows = first_table.find_all("tr")[1:]
last_names = []
for row in rows:
  cells = row.find_all("td")
  # A row with fewer than three <td> cells has nothing at index 2; skip it
  # rather than abort the whole scrape with an IndexError.
  if len(cells) > 2:
    last_names.append(cells[2].get_text())
#print(last_names)

######################################

# Download the Wikipedia article. The `with` block closes the HTTP response
# as soon as the body has been read.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
with urlopen(url) as response:
  html_code = response.read().decode("utf-8")
soup = BeautifulSoup(html_code, "lxml")

# Take the first element carrying the "wikitable" class (presumably the
# table of built-in types and their mutability -- confirm against the page).
# Every lookup below tolerates a missing element so that a change in the
# page layout yields empty results instead of an AttributeError that would
# stop the script before the final print.
type_table = soup.find(class_="wikitable")
body = type_table.find("tbody") if type_table is not None else None
if body is None:
  # No explicit <tbody> in the markup: search the rows on the table itself.
  body = type_table
# Skip the first row, as the original script does.
rows = body.find_all("tr")[1:] if body is not None else []

# Split the first-column names by the text of the second column.
mutable_types, immutable_types = [], []
for row in rows:
  data = row.find_all("td")
  if len(data) < 2:
    # Rows without both cells cannot be classified.
    continue
  # Compare and store stripped text so surrounding whitespace (such as the
  # trailing newline inside each cell) neither breaks the match nor ends up
  # in the collected names.
  type_name = data[0].get_text().strip()
  if data[1].get_text().strip() == "mutable":
    mutable_types.append(type_name)
  else:
    immutable_types.append(type_name)

#print(f"Mutable Types: {mutable_types}")
#print(f"Immutable Types: {immutable_types}")

# Source URL of the first image inside the first "thumb" element, or None
# when the element, its <img>, or the src attribute is absent.
thumb_box = soup.find(class_="thumb")
thumb_img = thumb_box.find("img") if thumb_box is not None else None
thumb_img_src = thumb_img.get("src") if thumb_img is not None else None
#print(thumb_img_src)

# Text of every link inside the "toc" element; empty when there is none.
toc = soup.find(class_="toc")
toc_text = [a.get_text() for a in toc.find_all("a")] if toc is not None else []
print(toc_text)

No comments:

Post a Comment