Handle edge cases with no title for metadata extraction

2025-08-10 01:56:54 +02:00 · 2020-11-05 19:41:44 +05:30 · 2020-11-05 19:41:44 +05:30 · 9137953537
commit 9137953537
parent 79ef6da558
1 changed file with 13 additions and 5 deletions
--- a/create_app.py
+++ b/create_app.py
@@ -45,7 +45,9 @@ def extract_metadata(url):
    metadata = {}

    # Find the title
-    titles = [soup.title.string]
+    titles = []
+    if soup.title:
+        titles = [soup.title.string]
    for tag in soup.find_all("meta"):
        title_props = ["title", "og:title", "twitter:title"]
        if tag.get("property", None) in title_props \
@@ -53,11 +55,16 @@ def extract_metadata(url):
            titles.append(tag["content"])
    # Set title to the most common if it occurs more than once, else prefer
    # title tag
-    most_common = Counter(titles).most_common(1)[0]
-    if most_common[1] > 1:
-        metadata["title"] = most_common[0].strip()
+    most_common = Counter(titles).most_common(1)
+    if not most_common:
+        metadata["title"] = None
+    elif most_common[0][1] > 1:
+        metadata["title"] = most_common[0][0].strip()
    else:
-        metadata["title"] = soup.title.string.strip()
+        if soup.title:
+            metadata["title"] = soup.title.string.strip()
+        else:
+            metadata["title"] = most_common[0][0].strip()

    # Find the image.
    # Try link first, followed by /favicon.{png,ico}, followed by og:, twitter:
@@ -121,6 +128,7 @@ def main():
        eprint(f"Maybe you meant https://{args.url} ?")
        sys.exit(1)

+    print("Fetching details ...")
    metadata = extract_metadata(args.url)
    if not args.name:
        args.name = metadata["title"]