Handle the edge case of pages with no title during metadata extraction
This commit is contained in:
parent
79ef6da558
commit
9137953537
|
@ -45,6 +45,8 @@ def extract_metadata(url):
|
|||
metadata = {}
|
||||
|
||||
# Find the title
|
||||
titles = []
|
||||
if soup.title:
|
||||
titles = [soup.title.string]
|
||||
for tag in soup.find_all("meta"):
|
||||
title_props = ["title", "og:title", "twitter:title"]
|
||||
|
@ -53,11 +55,16 @@ def extract_metadata(url):
|
|||
titles.append(tag["content"])
|
||||
# Set title to the most common if it occurs more than once, else prefer
|
||||
# title tag
|
||||
most_common = Counter(titles).most_common(1)[0]
|
||||
if most_common[1] > 1:
|
||||
metadata["title"] = most_common[0].strip()
|
||||
most_common = Counter(titles).most_common(1)
|
||||
if not most_common:
|
||||
metadata["title"] = None
|
||||
elif most_common[0][1] > 1:
|
||||
metadata["title"] = most_common[0][0].strip()
|
||||
else:
|
||||
if soup.title:
|
||||
metadata["title"] = soup.title.string.strip()
|
||||
else:
|
||||
metadata["title"] = most_common[0][0].strip()
|
||||
|
||||
# Find the image.
|
||||
# Try link first, followed by /favicon.{png,ico}, followed by og:, twitter:
|
||||
|
@ -121,6 +128,7 @@ def main():
|
|||
eprint(f"Maybe you meant https://{args.url} ?")
|
||||
sys.exit(1)
|
||||
|
||||
print("Fetching details ...")
|
||||
metadata = extract_metadata(args.url)
|
||||
if not args.name:
|
||||
args.name = metadata["title"]
|
||||
|
|
Loading…
Reference in New Issue