Add metadata extraction for missing parameters
This commit is contained in:
parent
9fdb906183
commit
d1da791858
115
create_app.py
115
create_app.py
|
@ -1,9 +1,87 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"Creates a firefox "
|
"Creates a firefox web app"
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import os.path as pt
|
import os.path as pt
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
REPO_DIR = pt.dirname(sys.argv[0])
|
||||||
|
BIN_DIR = f"{REPO_DIR}/bin"
|
||||||
|
ICON_DIR = f"{REPO_DIR}/icons"
|
||||||
|
|
||||||
|
|
||||||
|
def eprint(*args, **kwargs):
|
||||||
|
"Print an error"
|
||||||
|
print(*args, **kwargs, file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
def url_exists(url):
|
||||||
|
"Tests if the url exists after all redirects"
|
||||||
|
return requests.head(url, allow_redirects=True).status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
def absolute_url(base_url, relative_url):
|
||||||
|
"Returns the absolute_url if the relative_url is relative"
|
||||||
|
base = urlparse(base_url)
|
||||||
|
relative = urlparse(relative_url)
|
||||||
|
|
||||||
|
# Make the url absolute if it is not
|
||||||
|
if not relative.hostname:
|
||||||
|
return urlunparse((*base[:2], *relative[2:]))
|
||||||
|
return relative_url
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata(url):
|
||||||
|
"Extract metadata using bs4"
|
||||||
|
# Get and parse the page
|
||||||
|
content = requests.get(url, allow_redirects=True).content
|
||||||
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
|
metadata = {}
|
||||||
|
|
||||||
|
# Find the title
|
||||||
|
titles = [soup.title.string]
|
||||||
|
for tag in soup.find_all("meta"):
|
||||||
|
title_props = ["title", "og:title", "twitter:title"]
|
||||||
|
if tag.get("property", None) in title_props \
|
||||||
|
or tag.get("name", None) in title_props:
|
||||||
|
titles.append(tag["content"])
|
||||||
|
# Set title to the most common if it occurs more than once, else prefer
|
||||||
|
# title tag
|
||||||
|
most_common = Counter(titles).most_common(1)[0]
|
||||||
|
if most_common[1] > 1:
|
||||||
|
metadata["title"] = most_common[0].strip()
|
||||||
|
else:
|
||||||
|
metadata["title"] = soup.title.string.strip()
|
||||||
|
|
||||||
|
# Find the image.
|
||||||
|
# Try link first, followed by /favicon.{png,ico}, followed by og:, twitter:
|
||||||
|
image = None
|
||||||
|
for favicon in soup.find_all("link", rel="icon"):
|
||||||
|
if url_exists(absolute_url(url, favicon["href"])):
|
||||||
|
image = absolute_url(url, favicon["href"])
|
||||||
|
|
||||||
|
if not image:
|
||||||
|
for favicon in [absolute_url(url, i) for i in ("favicon.png", "favicon.ico")]:
|
||||||
|
if requests.head(favicon, allow_redirects=True).status_code == 200:
|
||||||
|
image = favicon
|
||||||
|
break
|
||||||
|
|
||||||
|
if not image:
|
||||||
|
for prop in ["og:image", "twitter:image"]:
|
||||||
|
prop_tag = soup.find("meta", property=prop)
|
||||||
|
if prop_tag and url_exists(absolute_url(url, prop_tag["content"])):
|
||||||
|
image = absolute_url(url, prop_tag["content"])
|
||||||
|
break
|
||||||
|
|
||||||
|
metadata["image"] = image
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"Main Function"
|
"Main Function"
|
||||||
|
@ -33,10 +111,41 @@ def main():
|
||||||
|
|
||||||
# Add Missing Arguments with default values
|
# Add Missing Arguments with default values
|
||||||
if args.firefox_profile is None:
|
if args.firefox_profile is None:
|
||||||
profile_path = pt.dirname(sys.argv[0]) + "/.firefox_profile"
|
profile_path = REPO_DIR + "/.firefox_profile"
|
||||||
with open(profile_path) as prof:
|
with open(profile_path) as prof:
|
||||||
args.firefox_profile = prof.readline()[:-1]
|
args.firefox_profile = prof.readline()[:-1]
|
||||||
print(args)
|
|
||||||
|
parsed_url = urlparse(args.url)
|
||||||
|
if not parsed_url.scheme:
|
||||||
|
eprint("Missing URL scheme")
|
||||||
|
eprint(f"Maybe you meant https://{args.url} ?")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
metadata = extract_metadata(args.url)
|
||||||
|
if not args.name:
|
||||||
|
args.name = metadata["title"]
|
||||||
|
if not args.logo:
|
||||||
|
args.logo = metadata["image"]
|
||||||
|
if not args.exec_name:
|
||||||
|
args.exec_name = parsed_url.hostname.replace(".", "-") + "-webapp"
|
||||||
|
|
||||||
|
if "/" in args.exec_name:
|
||||||
|
eprint("Executable name can't contain slashes.")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
if pt.exists(f"{BIN_DIR}/{args.exec_name}"):
|
||||||
|
index = 0
|
||||||
|
while True:
|
||||||
|
if not pt.exists(f"{BIN_DIR}/{args.exec_name}-{index}"):
|
||||||
|
args.exec_name = f"{args.exec_name}-{index}"
|
||||||
|
break
|
||||||
|
index += 1
|
||||||
|
print()
|
||||||
|
print(f"WebApp Name:\t\t{args.name}")
|
||||||
|
print(f"WebApp URL:\t\t{args.url}")
|
||||||
|
print(f"Logo URL:\t\t{args.logo}")
|
||||||
|
print(f"Executable Name:\t{args.exec_name}")
|
||||||
|
print(f"Firefox Profile:\t{args.firefox_profile}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue