Handle edge cases with no title for metadata extraction

This commit is contained in:
Ceda EI 2020-11-05 19:41:44 +05:30
parent 79ef6da558
commit 9137953537
1 changed file with 13 additions and 5 deletions

View File

@@ -45,6 +45,8 @@ def extract_metadata(url):
metadata = {} metadata = {}
# Find the title # Find the title
titles = []
if soup.title:
titles = [soup.title.string] titles = [soup.title.string]
for tag in soup.find_all("meta"): for tag in soup.find_all("meta"):
title_props = ["title", "og:title", "twitter:title"] title_props = ["title", "og:title", "twitter:title"]
@@ -53,11 +55,16 @@ def extract_metadata(url):
titles.append(tag["content"]) titles.append(tag["content"])
# Set title to the most common if it occurs more than once, else prefer # Set title to the most common if it occurs more than once, else prefer
# title tag # title tag
most_common = Counter(titles).most_common(1)[0] most_common = Counter(titles).most_common(1)
if most_common[1] > 1: if not most_common:
metadata["title"] = most_common[0].strip() metadata["title"] = None
elif most_common[0][1] > 1:
metadata["title"] = most_common[0][0].strip()
else: else:
if soup.title:
metadata["title"] = soup.title.string.strip() metadata["title"] = soup.title.string.strip()
else:
metadata["title"] = most_common[0][0].strip()
# Find the image. # Find the image.
# Try link first, followed by /favicon.{png,ico}, followed by og:, twitter: # Try link first, followed by /favicon.{png,ico}, followed by og:, twitter:
@@ -121,6 +128,7 @@ def main():
eprint(f"Maybe you meant https://{args.url} ?") eprint(f"Maybe you meant https://{args.url} ?")
sys.exit(1) sys.exit(1)
print("Fetching details ...")
metadata = extract_metadata(args.url) metadata = extract_metadata(args.url)
if not args.name: if not args.name:
args.name = metadata["title"] args.name = metadata["title"]