From 9137953537296ad4a421730d76984ea63e05a5d6 Mon Sep 17 00:00:00 2001 From: Ceda EI Date: Thu, 5 Nov 2020 19:41:44 +0530 Subject: [PATCH] Handle edge cases with no title for metadata extraction --- create_app.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/create_app.py b/create_app.py index c63a247..b6f6170 100755 --- a/create_app.py +++ b/create_app.py @@ -45,7 +45,9 @@ def extract_metadata(url): metadata = {} # Find the title - titles = [soup.title.string] + titles = [] + if soup.title: + titles = [soup.title.string] for tag in soup.find_all("meta"): title_props = ["title", "og:title", "twitter:title"] if tag.get("property", None) in title_props \ @@ -53,11 +55,16 @@ def extract_metadata(url): titles.append(tag["content"]) # Set title to the most common if it occurs more than once, else prefer # title tag - most_common = Counter(titles).most_common(1)[0] - if most_common[1] > 1: - metadata["title"] = most_common[0].strip() + most_common = Counter(titles).most_common(1) + if not most_common: + metadata["title"] = None + elif most_common[0][1] > 1: + metadata["title"] = most_common[0][0].strip() else: - metadata["title"] = soup.title.string.strip() + if soup.title: + metadata["title"] = soup.title.string.strip() + else: + metadata["title"] = most_common[0][0].strip() # Find the image. # Try link first, followed by /favicon.{png,ico}, followed by og:, twitter: @@ -121,6 +128,7 @@ def main(): eprint(f"Maybe you meant https://{args.url} ?") sys.exit(1) + print("Fetching details ...") metadata = extract_metadata(args.url) if not args.name: args.name = metadata["title"]