Add metadata extraction for missing parameters

2025-12-23 19:20:06 +01:00 · 2020-11-03 22:57:56 +05:30
parent 9fdb906183
commit d1da791858
1 changed files with 112 additions and 3 deletions
--- a/create_app.py
+++ b/create_app.py
@@ -1,9 +1,87 @@
 #!/usr/bin/env python3
-"Creates a firefox "
+"Creates a firefox web app"

 import argparse
 import sys
 import os.path as pt
+from urllib.parse import urlparse, urlunparse
+from collections import Counter
+
+import requests
+from bs4 import BeautifulSoup
+
+
+REPO_DIR = pt.dirname(sys.argv[0])
+BIN_DIR = f"{REPO_DIR}/bin"
+ICON_DIR = f"{REPO_DIR}/icons"
+
+
+def eprint(*args, **kwargs):
+    "Print an error"
+    print(*args, **kwargs, file=sys.stderr)
+
+
+def url_exists(url):
+    "Tests if the url exists after all redirects"
+    return requests.head(url, allow_redirects=True).status_code == 200
+
+
+def absolute_url(base_url, relative_url):
+    "Returns the absolute_url if the relative_url is relative"
+    base = urlparse(base_url)
+    relative = urlparse(relative_url)
+
+    # Make the url absolute if it is not
+    if not relative.hostname:
+        return urlunparse((*base[:2], *relative[2:]))
+    return relative_url
+
+
+def extract_metadata(url):
+    "Extract metadata using bs4"
+    # Get and parse the page
+    content = requests.get(url, allow_redirects=True).content
+    soup = BeautifulSoup(content, 'html.parser')
+    metadata = {}
+
+    # Find the title
+    titles = [soup.title.string]
+    for tag in soup.find_all("meta"):
+        title_props = ["title", "og:title", "twitter:title"]
+        if tag.get("property", None) in title_props \
+           or tag.get("name", None) in title_props:
+            titles.append(tag["content"])
+    # Set title to the most common if it occurs more than once, else prefer
+    # title tag
+    most_common = Counter(titles).most_common(1)[0]
+    if most_common[1] > 1:
+        metadata["title"] = most_common[0].strip()
+    else:
+        metadata["title"] = soup.title.string.strip()
+
+    # Find the image.
+    # Try link first, followed by /favicon.{png,ico}, followed by og:, twitter:
+    image = None
+    for favicon in soup.find_all("link", rel="icon"):
+        if url_exists(absolute_url(url, favicon["href"])):
+            image = absolute_url(url, favicon["href"])
+
+    if not image:
+        for favicon in [absolute_url(url, i) for i in ("favicon.png", "favicon.ico")]:
+            if requests.head(favicon, allow_redirects=True).status_code == 200:
+                image = favicon
+                break
+
+    if not image:
+        for prop in ["og:image", "twitter:image"]:
+            prop_tag = soup.find("meta", property=prop)
+            if prop_tag and url_exists(absolute_url(url, prop_tag["content"])):
+                image = absolute_url(url, prop_tag["content"])
+                break
+
+    metadata["image"] = image
+    return metadata
+

 def main():
    "Main Function"
@@ -33,10 +111,41 @@ def main():

    # Add Missing Arguments with default values
    if args.firefox_profile is None:
-        profile_path = pt.dirname(sys.argv[0]) + "/.firefox_profile"
+        profile_path = REPO_DIR + "/.firefox_profile"
        with open(profile_path) as prof:
            args.firefox_profile = prof.readline()[:-1]
-    print(args)
+
+    parsed_url = urlparse(args.url)
+    if not parsed_url.scheme:
+        eprint("Missing URL scheme")
+        eprint(f"Maybe you meant https://{args.url} ?")
+        sys.exit(1)
+
+    metadata = extract_metadata(args.url)
+    if not args.name:
+        args.name = metadata["title"]
+    if not args.logo:
+        args.logo = metadata["image"]
+    if not args.exec_name:
+        args.exec_name = parsed_url.hostname.replace(".", "-") + "-webapp"
+
+    if "/" in args.exec_name:
+        eprint("Executable name can't contain slashes.")
+        sys.exit(2)
+
+    if pt.exists(f"{BIN_DIR}/{args.exec_name}"):
+        index = 0
+        while True:
+            if not pt.exists(f"{BIN_DIR}/{args.exec_name}-{index}"):
+                args.exec_name = f"{args.exec_name}-{index}"
+                break
+            index += 1
+    print()
+    print(f"WebApp Name:\t\t{args.name}")
+    print(f"WebApp URL:\t\t{args.url}")
+    print(f"Logo URL:\t\t{args.logo}")
+    print(f"Executable Name:\t{args.exec_name}")
+    print(f"Firefox Profile:\t{args.firefox_profile}")


 if __name__ == "__main__":