piware.de Git - bin.git/commitdiff
recipekeeper-split: Add master
author Martin Pitt <martin@piware.de>
Sun, 18 May 2025 19:22:16 +0000 (21:22 +0200)
committer Martin Pitt <martin@piware.de>
Sun, 18 May 2025 19:32:32 +0000 (21:32 +0200)
recipekeeper-split [new file with mode: 0755]

diff --git a/recipekeeper-split b/recipekeeper-split
new file mode 100755 (executable)
index 0000000..b7c9c61
--- /dev/null
@@ -0,0 +1,98 @@
+#!/usr/bin/python3
+"""Split an HTML recipe export into one HTML file per recipe.
+
+Reads ``recipes.html`` from the current directory, repairs the markup just
+enough for it to parse as XML, and writes every <div class="recipe-details">
+found directly below <body> to its own file, together with a copy of the
+original <head>.
+"""
+import re
+import html
+import xml.etree.ElementTree as ET
+
+input_file = "recipes.html"
+
+
+def safe_filename(name):
+    """Return *name* reduced to a filename-safe stem of at most 40 characters.
+
+    Everything except word characters, whitespace and hyphens is dropped, and
+    each run of whitespace (tabs and newlines included) becomes a single
+    underscore. The result can be empty, e.g. for a punctuation-only name.
+    """
+    name = re.sub(r"[^\w\s-]", '', name).strip()
+    return re.sub(r"\s+", '_', name)[:40]
+
+
+# Decode explicitly: the output is declared and written as UTF-8, so the input
+# must not depend on the locale's default encoding.
+with open(input_file, "r", encoding="utf-8") as f:
+    content = f.read()
+
+# 1. Fix <meta ...> tags: make them self-closing
+content_fixed = re.sub(r'(<meta\b[^>]*)(?<!/)>', r'\1 />', content)
+
+# 2. Fix unquoted itemprop attribute values (itemprop=foo -> itemprop="foo")
+# Only match when the value is not already quoted
+content_fixed = re.sub(r'itemprop=([^\s">]+)', r'itemprop="\1"', content_fixed)
+
+# Parse as XML
+root = ET.fromstring(content_fixed)
+
+head = root.find('head')
+body = root.find('body')
+if head is None or body is None:
+    raise SystemExit(f"{input_file}: no <head>/<body> directly below the root element")
+recipes = [div for div in body.findall('div') if div.get('class') == 'recipe-details']
+
+# Prepare the <meta http-equiv="content-type"...> tag string
+meta_tag = '<meta http-equiv="content-type" content="text/html; charset=utf-8" />'
+
+# The <head> is the same for every recipe, so serialize it once. method="html"
+# writes empty non-void elements as <tag></tag>; the default XML serializer
+# would emit <tag />, which an HTML parser treats as an unclosed start tag.
+head_str = ET.tostring(head, encoding="unicode", method="html")
+head_str = re.sub(
+    r'(<head.*?>)',  # Match opening <head> tag (with possible attributes)
+    r'\1\n' + meta_tag,  # Insert meta tag right after opening tag
+    head_str,
+    count=1,
+    flags=re.IGNORECASE|re.DOTALL
+)
+
+# File stems already written in this run, used to avoid overwriting a recipe
+used_names = set()
+
+for recipe in recipes:
+    h2 = recipe.find('.//h2[@itemprop="name"]')
+    meta_id = recipe.find('.//meta[@itemprop="recipeId"]')
+    # Prefer the title, then the recipe id, then a generic name. Both values
+    # come from the document, so both are sanitized before naming a file.
+    base = ""
+    if h2 is not None and h2.text:
+        base = safe_filename(html.unescape(h2.text))
+    if not base and meta_id is not None:
+        base = safe_filename(meta_id.get('content', ''))
+    if not base:
+        base = "recipe"
+
+    # Titles can repeat or share their first 40 characters; add a numeric
+    # suffix instead of silently replacing the earlier recipe's file.
+    stem = base
+    suffix = 2
+    while stem in used_names:
+        stem = f"{base}_{suffix}"
+        suffix += 1
+    used_names.add(stem)
+    filename = f"{stem}.html"
+
+    out_html = (
+        '<!DOCTYPE html>\n<html>\n' +
+        head_str + '\n' +
+        '<body>\n' +
+        ET.tostring(recipe, encoding="unicode", method="html") + '\n' +
+        '</body>\n</html>'
+    )
+    with open(filename, "w", encoding="utf-8") as out:
+        out.write(out_html)
+    print(f"Wrote: {filename}")