Add metadata extraction for missing parameters

This commit is contained in:
Ceda EI 2020-11-03 22:57:56 +05:30
parent 9fdb906183
commit d1da791858
1 changed file with 112 additions and 3 deletions

View File

@ -1,9 +1,87 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"Creates a firefox " "Creates a firefox web app"
import argparse import argparse
import sys import sys
import os.path as pt import os.path as pt
from urllib.parse import urlparse, urlunparse
from collections import Counter
import requests
from bs4 import BeautifulSoup
# Directory that contains this script; the other paths are derived from it.
# abspath() guards against a bare "script.py" invocation, where
# dirname(sys.argv[0]) would be "" and the derived paths would collapse to
# "/bin" and "/icons" at the filesystem root.
REPO_DIR = pt.dirname(pt.abspath(sys.argv[0]))
BIN_DIR = pt.join(REPO_DIR, "bin")
ICON_DIR = pt.join(REPO_DIR, "icons")
def eprint(*args, **kwargs):
    """Print to standard error.

    Takes the same positional and keyword arguments as the built-in
    ``print``. Output goes to ``sys.stderr`` unless the caller supplies an
    explicit ``file=`` keyword, which is honoured instead of raising
    ``TypeError`` for a duplicated keyword argument.
    """
    # sys.stderr is looked up at call time so a redirected stderr is respected.
    kwargs.setdefault("file", sys.stderr)
    print(*args, **kwargs)
def url_exists(url, timeout=10):
    """Return True if *url* answers a HEAD request with HTTP 200.

    Redirects are followed, so the status that is checked is the one of the
    final response.

    Args:
        url: Absolute URL to probe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the final status code is exactly 200. False for any other
        status, or when the request itself fails (connection error, timeout,
        malformed URL, ...).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # An unreachable or invalid URL is treated as "does not exist" so
        # that callers can move on to their next candidate instead of
        # crashing.
        return False
    return response.status_code == 200
def absolute_url(base_url, relative_url):
    """Return *relative_url* as an absolute URL, using *base_url* if needed.

    * A URL that already has both a scheme and a host is returned unchanged.
    * A scheme-relative URL (``//host/path``) inherits the scheme of
      *base_url*; returned as-is it would not be usable for a request.
    * Anything without a host takes the scheme and host of *base_url* and
      keeps its own path, params, query and fragment.

    NOTE: a host-less path is attached directly to the host of *base_url*;
    the path of *base_url* is ignored. ``favicon.png`` therefore resolves to
    ``<scheme>://<host>/favicon.png`` regardless of how deep *base_url* is.
    """
    base = urlparse(base_url)
    target = urlparse(relative_url)

    if not target.hostname:
        # No host at all: borrow scheme and netloc from the base URL.
        return urlunparse((*base[:2], *target[2:]))
    if not target.scheme:
        # Host but no scheme ("//cdn.example.com/x.png"): borrow the scheme.
        return urlunparse((base.scheme, *target[1:]))
    return relative_url
def _pick_title(soup):
    "Return the best title found in the parsed page, or None if there is none"
    tag_title = None
    if soup.title is not None and soup.title.string:
        tag_title = soup.title.string.strip() or None

    # Titles advertised through <meta property=...> or <meta name=...>.
    title_props = ("title", "og:title", "twitter:title")
    meta_titles = []
    for tag in soup.find_all("meta"):
        if tag.get("property") in title_props or tag.get("name") in title_props:
            content = (tag.get("content") or "").strip()
            if content:
                meta_titles.append(content)

    candidates = ([tag_title] if tag_title else []) + meta_titles
    if not candidates:
        return None

    # Use the most common title if it occurs more than once, else prefer the
    # <title> tag, else fall back to the first <meta> title.
    best, count = Counter(candidates).most_common(1)[0]
    if count > 1:
        return best
    return tag_title or candidates[0]


def _find_icon(soup, url):
    "Return the URL of a reachable icon for the page, or None"
    # 1. <link rel="icon">. The last reachable link wins, so scan from the
    #    end and stop at the first hit instead of probing every link.
    for link in reversed(soup.find_all("link", rel="icon")):
        href = link.get("href")
        if not href:
            continue
        candidate = absolute_url(url, href)
        if url_exists(candidate):
            return candidate

    # 2. Conventional favicon locations at the root of the host.
    for path in ("/favicon.png", "/favicon.ico"):
        candidate = absolute_url(url, path)
        if url_exists(candidate):
            return candidate

    # 3. Social-card images. Accept both property= and name=, the same way
    #    the title lookup does.
    for prop in ("og:image", "twitter:image"):
        tag = soup.find("meta", property=prop)
        if tag is None:
            tag = soup.find("meta", attrs={"name": prop})
        content = tag.get("content") if tag is not None else None
        if not content:
            continue
        candidate = absolute_url(url, content)
        if url_exists(candidate):
            return candidate

    return None


def extract_metadata(url, timeout=10):
    """Fetch *url* and extract a title and an icon URL from its HTML.

    Args:
        url: Absolute URL of the page to inspect.
        timeout: Seconds to wait for the page download before giving up.
            It applies to the page fetch only, not to the icon probes.

    Returns:
        A dict with two keys:
        ``"title"`` - the page title with surrounding whitespace removed, or
        None when neither a <title> tag nor a title <meta> tag is present;
        ``"image"`` - the URL of the first icon candidate that ``url_exists``
        accepts, or None when no candidate is reachable.

    Raises:
        requests.RequestException: if the page itself cannot be fetched.
    """
    # Get and parse the page
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    return {
        "title": _pick_title(soup),
        "image": _find_icon(soup, url),
    }
def main(): def main():
"Main Function" "Main Function"
@ -33,10 +111,41 @@ def main():
# Add Missing Arguments with default values # Add Missing Arguments with default values
if args.firefox_profile is None: if args.firefox_profile is None:
profile_path = pt.dirname(sys.argv[0]) + "/.firefox_profile" profile_path = REPO_DIR + "/.firefox_profile"
with open(profile_path) as prof: with open(profile_path) as prof:
args.firefox_profile = prof.readline()[:-1] args.firefox_profile = prof.readline()[:-1]
print(args)
parsed_url = urlparse(args.url)
if not parsed_url.scheme:
eprint("Missing URL scheme")
eprint(f"Maybe you meant https://{args.url} ?")
sys.exit(1)
metadata = extract_metadata(args.url)
if not args.name:
args.name = metadata["title"]
if not args.logo:
args.logo = metadata["image"]
if not args.exec_name:
args.exec_name = parsed_url.hostname.replace(".", "-") + "-webapp"
if "/" in args.exec_name:
eprint("Executable name can't contain slashes.")
sys.exit(2)
if pt.exists(f"{BIN_DIR}/{args.exec_name}"):
index = 0
while True:
if not pt.exists(f"{BIN_DIR}/{args.exec_name}-{index}"):
args.exec_name = f"{args.exec_name}-{index}"
break
index += 1
print()
print(f"WebApp Name:\t\t{args.name}")
print(f"WebApp URL:\t\t{args.url}")
print(f"Logo URL:\t\t{args.logo}")
print(f"Executable Name:\t{args.exec_name}")
print(f"Firefox Profile:\t{args.firefox_profile}")
if __name__ == "__main__": if __name__ == "__main__":