'''
    DS2000
    Spring 2023
    Sample code from class -- quick example on web scraping
'''

# import urllib for grabbing web pages and beautifulsoup for parsing
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Pages scraped by this example
SIMPLE_URL = "https://www.khoury.northeastern.edu/home/laney/simple.html"
COURSE_URL = "https://course.ccs.neu.edu/ds2000/"


def _fetch(url):
    ''' Function: _fetch
        Parameter: url (string), the address of the page to download
        Returns: the raw response body, as bytes
        Does: opens the URL, reads the whole body, and closes the
              connection (via the with block) even if reading fails
    '''
    with urlopen(url) as response:
        return response.read()


def main():
    ''' Function: main
        Parameters: none
        Returns: nothing
        Does: downloads two web pages and prints pieces of them --
              the raw bytes of a simple page, then the first h1, the
              title, all paragraph text, the first image tag, and the
              text of every link on the DS2000 course website.
              Any network error raised by urlopen is not caught here.
    '''
    # Print the raw, unparsed bytes of a very simple page
    print(_fetch(SIMPLE_URL))

    # Find the first h1 header in our DS2000 course website
    bs = BeautifulSoup(_fetch(COURSE_URL), "html.parser")
    print(bs.h1)

    # Find the title in our DS2000 web page
    # (WARNING! People who make web pages are not necessarily consistent
    # with their use of tags. :)
    print(bs.title)

    # Find all the text enclosed in paragraph tags
    # Note that there's not much in paragraph tags because
    # we are not smart or consistent
    graphs = bs.find_all("p")
    for p in graphs:
        print(p.get_text())

    # Find the first image on the website (it's our Khoury logo!)
    print(bs.img)

    # Find all the link text in our website. For accessibility standards,
    # these should be descriptive.
    # href=True keeps only <a> tags that actually have an href attribute;
    # a dict like {"a": "href"} would only be read as a set of tag names.
    links = bs.find_all("a", href=True)
    for link in links:
        print(link.get_text())


# Only scrape when run as a script, not when this file is imported
if __name__ == "__main__":
    main()