Skip to content

Commit 6a9b95f

Browse files
committed
Add SEO post-processing for cloud wiki
1 parent 40b954c commit 6a9b95f

File tree

5 files changed

+282
-4
lines changed

5 files changed

+282
-4
lines changed

.github/workflows/build_master.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ jobs:
3434
# Build the mdBook
3535
- name: Build mdBook
3636
run: MDBOOK_BOOK__LANGUAGE=en mdbook build || (echo "Error logs" && cat hacktricks-preprocessor-error.log && echo "" && echo "" && echo "Debug logs" && (cat hacktricks-preprocessor.log | tail -n 20) && exit 1)
37+
38+
- name: Post-process SEO artifacts
39+
run: |
40+
python3 scripts/seo_postprocess.py pages \
41+
--book-dir ./book \
42+
--site-url https://cloud.hacktricks.wiki \
43+
--lang en \
44+
--default-lang en \
45+
--site-name "HackTricks Cloud"
3746
3847
- name: Push search index to hacktricks-searchindex repo
3948
shell: bash
@@ -149,6 +158,15 @@ jobs:
149158
- name: Sync to S3
150159
run: aws s3 sync ./book s3://hacktricks-cloud/en --delete
151160

161+
- name: Upload root sitemap index
162+
run: |
163+
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
164+
if [ -z "$LANGS" ]; then
165+
LANGS="en"
166+
fi
167+
python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
168+
aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300
169+
152170
- name: Upload root ads.txt
153171
run: |
154172
aws s3 cp ./ads.txt s3://hacktricks-cloud/ads.txt --content-type text/plain --cache-control max-age=300

.github/workflows/translate_all.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,15 @@ jobs:
254254
with:
255255
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
256256
aws-region: us-east-1
257+
258+
- name: Post-process SEO artifacts
259+
run: |
260+
python3 scripts/seo_postprocess.py pages \
261+
--book-dir ./book \
262+
--site-url https://cloud.hacktricks.wiki \
263+
--lang "$BRANCH" \
264+
--default-lang en \
265+
--site-name "HackTricks Cloud"
257266
258267
# Sync the build to S3
259268
- name: Sync to S3
@@ -265,3 +274,12 @@ jobs:
265274
echo "Sync completed"
266275
echo "Cat 3 files from the book"
267276
find . -type f -name 'index.html' -print | head -n 3 | xargs -r cat
277+
278+
- name: Refresh root sitemap index
279+
run: |
280+
LANGS=$(aws s3api list-objects-v2 --bucket hacktricks-cloud --delimiter / --query 'CommonPrefixes[].Prefix' --output text | tr '\t' '\n' | sed 's:/$::' | grep -E '^[a-z]{2}$' | sort | paste -sd, -)
281+
if [ -z "$LANGS" ]; then
282+
LANGS="en"
283+
fi
284+
python3 scripts/seo_postprocess.py index --site-url https://cloud.hacktricks.wiki --languages "$LANGS" --output ./sitemap.xml
285+
aws s3 cp ./sitemap.xml s3://hacktricks-cloud/sitemap.xml --content-type application/xml --cache-control max-age=300

scripts/seo_postprocess.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
import argparse
2+
import html
3+
import re
4+
from datetime import datetime, timezone
5+
from pathlib import Path
6+
import xml.etree.ElementTree as ET
7+
8+
9+
DEFAULT_LANGUAGES = [
10+
"af",
11+
"zh",
12+
"es",
13+
"en",
14+
"fr",
15+
"de",
16+
"el",
17+
"hi",
18+
"it",
19+
"ja",
20+
"ko",
21+
"pl",
22+
"pt",
23+
"sr",
24+
"sw",
25+
"tr",
26+
"uk",
27+
]
28+
29+
SKIP_HTML = {"404.html", "print.html", "toc.html"}
30+
SEO_START = "<!-- HT_SEO_START -->"
31+
SEO_END = "<!-- HT_SEO_END -->"
32+
33+
34+
def parse_args():
    """Parse the command line for the ``pages`` and ``index`` sub-commands.

    ``pages`` requires ``--book-dir``, ``--site-url`` and ``--lang`` and
    accepts optional ``--default-lang``, ``--languages`` and ``--site-name``.
    ``index`` requires ``--site-url``, ``--languages`` and ``--output``.

    Returns:
        The ``argparse.Namespace`` produced by the parser; the chosen
        sub-command name is stored in its ``command`` attribute.
    """
    cli = argparse.ArgumentParser()
    commands = cli.add_subparsers(dest="command", required=True)

    # "pages": rewrite the HTML of one built language and emit its sitemap.
    pages_cmd = commands.add_parser("pages")
    for flag in ("--book-dir", "--site-url", "--lang"):
        pages_cmd.add_argument(flag, required=True)
    pages_cmd.add_argument("--default-lang", default="en")
    pages_cmd.add_argument("--languages", default=",".join(DEFAULT_LANGUAGES))
    pages_cmd.add_argument("--site-name", default="HackTricks Cloud")

    # "index": write the root sitemap index that points at each language.
    index_cmd = commands.add_parser("index")
    for flag in ("--site-url", "--languages", "--output"):
        index_cmd.add_argument(flag, required=True)

    return cli.parse_args()
52+
53+
54+
def parse_languages(raw):
    """Turn a comma-separated string into a sorted list of language codes.

    Each item is stripped of surrounding whitespace and kept only if it is
    exactly two lowercase ASCII letters; anything else is dropped silently.
    Duplicates are removed.

    Args:
        raw: Comma-separated language codes, e.g. ``"en, es,fr"``.

    Returns:
        The unique valid codes in ascending order.
    """
    candidates = (piece.strip() for piece in raw.split(","))
    valid_codes = {code for code in candidates if re.fullmatch(r"[a-z]{2}", code)}
    return sorted(valid_codes)
61+
62+
63+
def iter_html_files(book_dir):
    """Yield every ``*.html`` file under ``book_dir`` in sorted path order.

    Files whose base name appears in ``SKIP_HTML`` are not yielded.

    Args:
        book_dir: Root directory to search recursively.

    Yields:
        ``pathlib.Path`` objects for each remaining HTML file.
    """
    discovered = sorted(Path(book_dir).rglob("*.html"))
    yield from (page for page in discovered if page.name not in SKIP_HTML)
68+
69+
70+
def canonical_url(site_url, lang, rel_path):
    """Return the absolute URL of a page inside one language subtree.

    The result has the form ``<site_url>/<lang>/<rel_path>``. Any trailing
    slash on ``site_url`` is dropped, and the relative path is rendered with
    forward slashes and percent-encoded so the URL stays valid when a file
    name contains spaces, non-ASCII characters, ``?``, ``#`` or ``%``.

    Args:
        site_url: Site origin, with or without a trailing slash.
        lang: Language code used as the first path segment.
        rel_path: Page path relative to the book root; must provide
            ``as_posix()`` (e.g. a ``pathlib`` path).

    Returns:
        The URL as a string.
    """
    # Local import keeps this block self-contained within the file.
    from urllib.parse import quote

    # Characters legal inside an RFC 3986 path segment stay literal, so
    # ordinary file names produce exactly the same URL as before.
    encoded_path = quote(rel_path.as_posix(), safe="/-._~!$&'()*+,;=:@")
    return f"{site_url.rstrip('/')}/{lang}/{encoded_path}"
72+
73+
74+
def clean_text(fragment):
    """Reduce an HTML fragment to plain, single-spaced text.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, remaining tags are replaced by spaces, HTML entities are
    decoded, and runs of whitespace are collapsed to one space.

    Args:
        fragment: HTML markup as a string.

    Returns:
        The visible text with leading and trailing whitespace removed.
    """
    stripped = fragment
    # Drop whole script/style elements first so their bodies never leak
    # into the text once the tags themselves are removed.
    for block_pattern in (r"<script\b[^>]*>.*?</script>", r"<style\b[^>]*>.*?</style>"):
        stripped = re.sub(block_pattern, " ", stripped, flags=re.I | re.S)

    plain = html.unescape(re.sub(r"<[^>]+>", " ", stripped))
    return re.sub(r"\s+", " ", plain).strip()
81+
82+
83+
def trim_description(text, fallback):
    """Normalise a description and cap it at 160 characters.

    ``fallback`` is used when ``text`` is empty. Whitespace runs are
    collapsed to single spaces. Text longer than 160 characters is cut to
    its first 157 characters, shortened back to the last space if there is
    one, and suffixed with ``"..."``.

    Args:
        text: Preferred description; may be empty or ``None``.
        fallback: Description used when ``text`` is falsy.

    Returns:
        The normalised, possibly truncated description.
    """
    normalized = re.sub(r"\s+", " ", text or fallback).strip()
    if len(normalized) > 160:
        head = normalized[:157]
        before, separator, _ = head.rpartition(" ")
        if separator:
            # Avoid ending on a partial word.
            head = before
        return head + "..."
    return normalized
92+
93+
94+
def extract_description(document, fallback):
    """Pick a meta-description candidate from a page's HTML.

    The search is limited to the contents of the first ``<main>`` element
    when one exists, otherwise the whole document. Paragraphs are tried
    first, then list items, then ``h1``/``h2`` headings; the first element
    whose cleaned text is at least 40 characters long wins. If none
    qualifies, the cleaned text of the whole scope is used.

    Args:
        document: Full HTML of the page.
        fallback: Description handed to ``trim_description`` for the case
            where the selected text is empty.

    Returns:
        The description after ``trim_description`` has been applied.
    """
    main_match = re.search(r"<main\b[^>]*>(.*?)</main>", document, flags=re.I | re.S)
    scope = document if main_match is None else main_match.group(1)

    element_patterns = (
        r"<p\b[^>]*>(.*?)</p>",
        r"<li\b[^>]*>(.*?)</li>",
        r"<h[12]\b[^>]*>(.*?)</h[12]>",
    )
    candidates = (
        clean_text(found.group(1))
        for element_pattern in element_patterns
        for found in re.finditer(element_pattern, scope, flags=re.I | re.S)
    )
    chosen = next((candidate for candidate in candidates if len(candidate) >= 40), None)
    if chosen is not None:
        return trim_description(chosen, fallback)

    return trim_description(clean_text(scope), fallback)
105+
106+
107+
def build_seo_block(site_url, lang, rel_path, languages, default_lang):
    """Build the marker-delimited ``<link>`` block for a page's ``<head>``.

    The block contains, in order: the start marker, a canonical link for
    ``lang``, one ``hreflang`` alternate per entry in ``languages``, an
    ``x-default`` alternate pointing at ``default_lang``, and the end marker.

    Args:
        site_url: Site origin passed through to ``canonical_url``.
        lang: Language of the page being processed.
        rel_path: Page path relative to the book root.
        languages: Language codes to list as alternates.
        default_lang: Language used for the ``x-default`` alternate.

    Returns:
        The lines of the block joined with ``"\\n "``.
    """
    def href_for(code):
        # Attribute-escaped URL of this same page under another language.
        return html.escape(canonical_url(site_url, code, rel_path), quote=True)

    alternates = [
        f'<link rel="alternate" hreflang="{code}" href="{href_for(code)}">'
        for code in languages
    ]
    parts = [
        SEO_START,
        f'<link rel="canonical" href="{href_for(lang)}">',
        *alternates,
        f'<link rel="alternate" hreflang="x-default" href="{href_for(default_lang)}">',
        SEO_END,
    ]
    return "\n ".join(parts)
121+
122+
123+
def update_document(document, site_url, lang, rel_path, languages, default_lang, site_name):
    """Return ``document`` with a refreshed description and SEO link block.

    Steps, in order:

    1. Derive a description from the page body, falling back to
       ``"<site_name>: <page title>"``.
    2. Remove any SEO block left by a previous run (text between the
       ``HT_SEO_START`` / ``HT_SEO_END`` comment markers).
    3. Overwrite the first ``<meta name="description" content="...">`` tag,
       or, when there is none and a ``<title>`` exists, insert a new meta
       tag right after the title.
    4. Insert the freshly built SEO block before the first ``</head>``.

    Page-derived text is inserted through callable replacements rather than
    replacement templates: ``re.sub`` interprets backslashes in a template
    string, so a description such as ``C:\\Users`` would raise ``re.error``
    or be expanded as a group reference.

    Args:
        document: Full HTML of the page.
        site_url: Site origin used for canonical/alternate URLs.
        lang: Language of the page being processed.
        rel_path: Page path relative to the book root.
        languages: Language codes to advertise as alternates.
        default_lang: Language used for the ``x-default`` alternate.
        site_name: Site name used in the fallback title/description.

    Returns:
        The updated HTML. A document without ``</head>`` receives no SEO
        block.
    """
    title_match = re.search(r"<title>(.*?)</title>", document, flags=re.I | re.S)
    page_title = clean_text(title_match.group(1)) if title_match else site_name
    fallback_description = f"{site_name}: {page_title}"
    description = extract_description(document, fallback_description)
    escaped_description = html.escape(description, quote=True)
    seo_block = build_seo_block(site_url, lang, rel_path, languages, default_lang)

    # Strip the block from an earlier run so repeated runs do not stack them.
    document = re.sub(
        r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*",
        "\n",
        document,
        flags=re.S,
    )

    # Callable replacement: the description is inserted verbatim, never
    # parsed as a replacement template.
    document, replaced = re.subn(
        r'(<meta\s+name="description"\s+content=")[^"]*("\s*/?>)',
        lambda found: found.group(1) + escaped_description + found.group(2),
        document,
        count=1,
        flags=re.I,
    )
    if not replaced and title_match:
        document = document.replace(
            title_match.group(0),
            title_match.group(0) + f'\n <meta name="description" content="{escaped_description}">',
            1,
        )

    # Same reasoning: URLs in the SEO block must not be template-expanded.
    head_insertion = f" {seo_block}\n </head>"
    document = re.sub(r"</head>", lambda _found: head_insertion, document, count=1, flags=re.I)
    return document
154+
155+
156+
def generate_language_sitemap(book_dir, site_url, lang, languages, default_lang):
    """Write ``<book_dir>/sitemap.xml`` listing every page of one language.

    Each page from ``iter_html_files`` gets a ``<url>`` entry with its
    ``<loc>``, a ``<lastmod>`` taken from the file's modification time
    (UTC date), one ``xhtml:link`` alternate per entry in ``languages`` and
    a final ``x-default`` alternate pointing at ``default_lang``.

    Args:
        book_dir: Root of the built book; also where the sitemap is written.
        site_url: Site origin used to build URLs.
        lang: Language whose pages are being listed.
        languages: Language codes to advertise as alternates.
        default_lang: Language used for the ``x-default`` alternate.
    """
    sitemap_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    xhtml_ns = "http://www.w3.org/1999/xhtml"
    ET.register_namespace("", sitemap_ns)
    ET.register_namespace("xhtml", xhtml_ns)

    root = ET.Element(f"{{{sitemap_ns}}}urlset")
    # (hreflang value, language whose URL it points to); x-default goes last.
    alternate_targets = [(code, code) for code in languages]
    alternate_targets.append(("x-default", default_lang))

    for page in iter_html_files(book_dir):
        relative = page.relative_to(book_dir)
        entry = ET.SubElement(root, f"{{{sitemap_ns}}}url")

        location = ET.SubElement(entry, f"{{{sitemap_ns}}}loc")
        location.text = canonical_url(site_url, lang, relative)

        modified = datetime.fromtimestamp(page.stat().st_mtime, tz=timezone.utc)
        ET.SubElement(entry, f"{{{sitemap_ns}}}lastmod").text = modified.date().isoformat()

        for hreflang, target_lang in alternate_targets:
            ET.SubElement(
                entry,
                f"{{{xhtml_ns}}}link",
                {
                    "rel": "alternate",
                    "hreflang": hreflang,
                    "href": canonical_url(site_url, target_lang, relative),
                },
            )

    destination = Path(book_dir) / "sitemap.xml"
    ET.ElementTree(root).write(destination, encoding="utf-8", xml_declaration=True)
195+
196+
197+
def process_pages(args):
    """Run the ``pages`` sub-command.

    Every HTML file returned by ``iter_html_files`` is read as UTF-8,
    passed through ``update_document`` and written back in place; the
    per-language sitemap is generated afterwards.

    Args:
        args: Parsed arguments providing ``book_dir``, ``site_url``,
            ``lang``, ``default_lang``, ``languages`` and ``site_name``.
    """
    root = Path(args.book_dir)
    language_codes = parse_languages(args.languages)

    for page in iter_html_files(root):
        relative = page.relative_to(root)
        original_html = page.read_text(encoding="utf-8")
        rewritten_html = update_document(
            original_html,
            args.site_url,
            args.lang,
            relative,
            language_codes,
            args.default_lang,
            args.site_name,
        )
        page.write_text(rewritten_html, encoding="utf-8")

    generate_language_sitemap(root, args.site_url, args.lang, language_codes, args.default_lang)
216+
217+
218+
def generate_sitemap_index(args):
    """Run the ``index`` sub-command: write the root sitemap index.

    One ``<sitemap>`` entry is emitted per valid language in
    ``args.languages``, pointing at ``<site_url>/<lang>/sitemap.xml`` and
    stamped with today's UTC date as ``<lastmod>``.

    Args:
        args: Parsed arguments providing ``site_url``, ``languages`` and
            ``output`` (the file to write).
    """
    sitemap_ns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    ET.register_namespace("", sitemap_ns)

    root = ET.Element(f"{{{sitemap_ns}}}sitemapindex")
    today = datetime.now(timezone.utc).date().isoformat()

    for code in parse_languages(args.languages):
        node = ET.SubElement(root, f"{{{sitemap_ns}}}sitemap")
        location = ET.SubElement(node, f"{{{sitemap_ns}}}loc")
        location.text = f"{args.site_url.rstrip('/')}/{code}/sitemap.xml"
        ET.SubElement(node, f"{{{sitemap_ns}}}lastmod").text = today

    ET.ElementTree(root).write(args.output, encoding="utf-8", xml_declaration=True)
231+
232+
233+
def main():
    """Parse the command line and dispatch to the selected sub-command."""
    args = parse_args()
    handlers = {
        "pages": process_pages,
        "index": generate_sitemap_index,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
239+
240+
241+
if __name__ == "__main__":
242+
main()

src/robots.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Sitemap: https://www.hacktricks.wiki/sitemap.xml
1+
Sitemap: https://cloud.hacktricks.wiki/sitemap.xml
22

33
User-agent: *
4-
Disallow:
4+
Disallow:

theme/index.hbs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@
163163
<a class="menu-bar-link" href="https://hacktricks-training.com" target="_blank">
164164
HT Training
165165
</a>
166-
<a class="menu-bar-link" href="https://book.hacktricks.wiki/" target="_blank">
166+
<a class="menu-bar-link" href="https://hacktricks.wiki/" target="_blank">
167167
HT Book
168168
</a>
169169
<a class="menu-bar-link" href="https://tools.hacktricks.wiki/" target="_blank">
@@ -184,7 +184,7 @@
184184
<span class="menu-hamburger" aria-hidden="true">≡</span>
185185
<div id="menubar-collapse-popup" class="menubar-collapse-popup" aria-label="Menu" role="menu">
186186
<a href="https://hacktricks-training.com" target="_blank" role="menuitem" class="menu-bar-link">HT Training</a>
187-
<a href="https://book.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
187+
<a href="https://hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">Book HT</a>
188188
<a href="https://tools.hacktricks.wiki/" target="_blank" role="menuitem" class="menu-bar-link">HT Tools</a>
189189
<a href="https://github.com/sponsors/carlospolop" target="_blank" role="menuitem" class="menu-bar-link">Sponsor</a>
190190
<a href="https://www.linkedin.com/company/hacktricks" target="_blank" role="menuitem" class="menu-bar-link">Linkedin</a>

0 commit comments

Comments
 (0)