I'm working for a project for university and I would like to analyze the features of the most popular tv shows using web scarping and text mining. So I tried to scrape the url from every tv show in the list in this site https://www.imdb.com/chart/toptv/ using the following code in Python, but in output I only receive the site url.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.imdb.com/chart/toptv/m"
df = pd.DataFrame()
links = []
def extract_links(url):
print("source url",url)
global links
source_url = requests.get(url)
soup = BeautifulSoup(source_url.content,"html.parser")
for link in soup.find_all('a',href=True):
try:
if len(links) >=100:
return
if link.get('href').startswith("https://") and link.get("href") not in links:
links.append(link.get('href'))
extract_links(link.get('href'))
except Exception as e:
print("Unhandled exception",e)
extract_links(url)
df = pd.DataFrame({"links":links})
df.to_csv("links.csv")
I also searched in the web and I found this code, but it doesn't work either.
import requests
from bs4 import BeautifulSoup
# send a GET request to the website
url = 'https://www.imdb.com/chart/toptv/'
response = requests.get(url)
# parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# find all links on the page
links = soup.find_all('a')
# print the href attribute of each link
for link in links:
print(link.get('href'))
Can someone help me and tell me what I'm doing wrong? Thank you