from cs50 import get_string from bs4 import BeautifulSoup from requests import get import string # scrapes the title, first sentence, and first image of a wikipedia page def main(): # prompt user for a wiki page q = get_string("Which Wikipedia article? ") url_q = add_underscores(q) # get wikipedia page page = get("https://en.wikipedia.org/wiki/" + url_q) # parse html soup = BeautifulSoup(page.content, 'html.parser') title = soup.find('h1', id='firstHeading').get_text() # get image in table img = soup.find('table').find('img').get('src') if not img: img = "No available image" body = soup.find('div', class_='mw-parser-output') if not body: print("No Wikipedia page available for " + q) return 1 # get first 5 p tags in body body_text = body.find_all('p', limit=5) print("Title: " + title) print("Description: " + first_sentence(body_text, q.split(" ")[0])) print("Img URL: " + img[2:len(img)]) # return the first sentence containing a keyword in a block of tags def first_sentence(b, q): # for each tag in a block of tags for p in b: sentences = p.get_text().split(".") # for each sentence of text in a tag for s in sentences: if q.lower() in s.lower(): return s return b[0].get_text().split(".")[0] def add_underscores(s): return s.replace(" ", "_") if __name__ == "__main__": main()