Export posts out of NodeBB into HTML and Markdown flat files -> Halo ITSM
At work, we are transitioning our Knowledge Base from NodeBB to Halo ITSM, which we require for SOC 2 compliance amongst other things. Because I had 165 articles in NodeBB that I didn't want to re-type, or even copy and paste, I decided to write a Python script to walk the target category and create a file for each topic.
Here's the script that does that. There are a number of prerequisites, which I've identified below.
```python
import os
import re
import time
import requests
import html2text
from datetime import datetime

# --- CONFIGURATION ---
# Your forum URL goes here
BASE_URL = "https://yourforum.com"

# The category ID you want to target goes here
CATEGORY_ID = 3

# In my case, I needed to define a new "home" for the exported files under
# `/public/uploads`, as this contained all the images I needed to embed into
# the new flat files. ASSET_DOMAIN is therefore nothing more than a basic
# website where I can grab the images from afterwards.
ASSET_DOMAIN = "https://assetlocation.com"

# The below directories are created at the same level as the script if they
# do not already exist. They will contain both HTML and Markdown copies of
# the posts.
HTML_DIR = "nodebb_export_html"
MD_DIR = "nodebb_export_markdown"
os.makedirs(HTML_DIR, exist_ok=True)
os.makedirs(MD_DIR, exist_ok=True)

# html2text converts each post's HTML body into Markdown
h = html2text.HTML2Text()
h.ignore_links = False
h.body_width = 0

page = 1
total_exported = 0

print(f"🔄 Starting export for category {CATEGORY_ID} from {BASE_URL}")

while True:
    print(f"📄 Fetching page {page}...")
    url = f"{BASE_URL}/api/category/{CATEGORY_ID}?page={page}"
    res = requests.get(url, timeout=10)
    if res.status_code != 200:
        print(f"❌ Failed to fetch page {page}: {res.status_code}")
        break

    data = res.json()
    topics = data.get("topics", [])
    if not topics:
        print("✅ No more topics found. Export complete.")
        break

    for topic in topics:
        tid = topic['tid']
        title = topic['title']
        print(f"→ Exporting topic {tid}: {title}")

        topic_url = f"{BASE_URL}/api/topic/{tid}"
        topic_res = requests.get(topic_url, timeout=10)
        if topic_res.status_code != 200:
            print(f"⚠️ Failed to fetch topic {tid}")
            continue

        topic_data = topic_res.json()
        posts = topic_data.get("posts", [])
        tags = topic_data.get("topic", {}).get("tags", [])
        tag_list = ", ".join(tags) if tags else ""

        safe_title = title.replace(' ', '_').replace('/', '-')
        html_file = f"{HTML_DIR}/{tid}-{safe_title}.html"
        md_file = f"{MD_DIR}/{tid}-{safe_title}.md"

        # --- HTML EXPORT ---
        with open(html_file, "w", encoding="utf-8") as f_html:
            f_html.write(f"<html><head><title>{title}</title></head><body>\n")
            f_html.write(f"<h1>{title}</h1>\n")
            if tag_list:
                f_html.write(f"<p><strong>Tags:</strong> {tag_list}</p>\n")
            for post in posts:
                username = post['user']['username']
                content_html = post['content']
                timestamp = datetime.utcfromtimestamp(post['timestamp'] / 1000).strftime('%Y-%m-%d %H:%M:%S UTC')
                pid = post['pid']

                # Rewrite asset paths in HTML
                content_html = re.sub(
                    r'src=["\'](/assets/uploads/files/.*?)["\']',
                    rf'src="{ASSET_DOMAIN}\1"',
                    content_html
                )
                content_html = re.sub(
                    r'href=["\'](/assets/uploads/files/.*?)["\']',
                    rf'href="{ASSET_DOMAIN}\1"',
                    content_html
                )

                f_html.write("<div class='post'>\n")
                f_html.write(f"<h3><strong>Original Author: {username}</strong></h3>\n")
                f_html.write(f"<p><em>Posted on: {timestamp} | Post ID: {pid}</em></p>\n")
                f_html.write(f"{content_html}\n")
                f_html.write("<hr/>\n</div>\n")
            f_html.write("</body></html>\n")

        # --- MARKDOWN EXPORT ---
        with open(md_file, "w", encoding="utf-8") as f_md:
            # Metadata block
            f_md.write("<!-- FAQLists: Knowledge Base -->\n")
            if tag_list:
                f_md.write(f"<!-- Tags: {tag_list} -->\n")
            f_md.write("\n")
            f_md.write(f"# {title}\n\n")
            for post in posts:
                username = post['user']['username']
                content_html = post['content']
                timestamp = datetime.utcfromtimestamp(post['timestamp'] / 1000).strftime('%Y-%m-%d %H:%M:%S UTC')
                pid = post['pid']

                # Convert HTML to Markdown
                content_md = h.handle(content_html).strip()

                # Rewrite asset paths in Markdown images and links
                content_md = re.sub(
                    r'(!\[.*?\])\((/assets/uploads/files/.*?)\)',
                    rf'\1({ASSET_DOMAIN}\2)',
                    content_md
                )
                content_md = re.sub(
                    r'(\[.*?\])\((/assets/uploads/files/.*?)\)',
                    rf'\1({ASSET_DOMAIN}\2)',
                    content_md
                )

                f_md.write(f"**Original Author: {username}**\n\n")
                f_md.write(f"_Posted on: {timestamp} | Post ID: {pid}_\n\n")
                f_md.write(f"{content_md}\n\n---\n\n")

        total_exported += 1
        print(f"✔ Saved: {html_file} & {md_file}")

    page += 1
    time.sleep(1)

print(f"\n🎉 Done! Exported {total_exported} topics to '{HTML_DIR}' and '{MD_DIR}'")
```

Run the script using:
```bash
python scriptname.py
```
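For reference, given what the Markdown branch writes, each exported file should end up looking something like this (the tags, title, author, timestamp, and body below are placeholder values, not real output):

```markdown
<!-- FAQLists: Knowledge Base -->
<!-- Tags: vpn, onboarding -->

# How to connect to the VPN

**Original Author: jsmith**

_Posted on: 2025-01-15 09:30:00 UTC | Post ID: 1234_

First, install the client from...

---
```

Those metadata comments at the top matter: the import script further down reads them back out to populate the FAQLists and Tags columns.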
If the script fails, it's likely because you do not have the required modules installed in Python:
```python
import os
import re
import time
import requests
import html2text
```

In this case, you'd need to install the missing ones using (for example):
```bash
pip install html2text
```
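Of the imports above, only `requests` and `html2text` are third-party; `os`, `re`, `time`, and `datetime` ship with Python. The Excel script further down additionally needs `pandas`, `markdown`, and `openpyxl` (the engine pandas uses to write `.xlsx` files), so if you'd rather install everything up front, something like this should cover it:

```bash
pip install requests html2text pandas markdown openpyxl
```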
To get the exported posts into an Excel file from which they can all be bulk imported, we'd then use something like the script below:
```python
import os
import re
import pandas as pd
from datetime import datetime
import markdown

# --- CONFIGURATION ---
export_dir = "nodebb_export_markdown"
output_file = "Halo_KB_Import_HTML.xlsx"
# This value can be whatever suits your needs
created_by = "Import"
today = datetime.today().strftime('%Y-%m-%d')

# --- BUILD DATAFRAME FOR HALO ---
import_rows = []

for filename in sorted(os.listdir(export_dir)):
    if filename.endswith(".md"):
        filepath = os.path.join(export_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Default values
        # Change "Knowledge Base" to reflect what you are using in Halo
        faqlists = "Knowledge Base"
        tags = ""

        # Parse metadata comments from top of file
        metadata_lines = []
        while lines and lines[0].startswith("<!--"):
            metadata_lines.append(lines.pop(0).strip())
        for line in metadata_lines:
            faq_match = re.match(r"<!-- FAQLists:\s*(.*?)\s*-->", line)
            tag_match = re.match(r"<!-- Tags:\s*(.*?)\s*-->", line)
            if faq_match:
                faqlists = faq_match.group(1)
            if tag_match:
                tags = tag_match.group(1)

        markdown_content = ''.join(lines)
        html_content = markdown.markdown(markdown_content)

        # Extract summary from filename (strip the "tid-" prefix and extension)
        summary = filename.split('-', 1)[1].rsplit('.md', 1)[0].replace('_', ' ')

        import_rows.append({
            "Summary": summary,
            "Details": html_content,
            "Resolution": "",
            "DateAdded": today,
            "CreatedBy": created_by,
            "FAQLists": faqlists,
            "Tags": tags
        })

# --- EXPORT TO EXCEL ---
df = pd.DataFrame(import_rows)
df.to_excel(output_file, index=False)
print(f"✅ Done! Halo HTML import file created: {output_file}")
```

This then generates a file called
`Halo_KB_Import_HTML.xlsx`, which you can then use to import each exported post into Halo. Cool, eh? Huge time saver.
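Before running the import, it's worth a quick sanity check that the spreadsheet holds what you expect. Here's a minimal sketch, assuming the generated file sits alongside the script and pandas is installed:

```python
import pandas as pd

# Read the generated import file back in and eyeball the basics
df = pd.read_excel("Halo_KB_Import_HTML.xlsx")

print(f"Rows: {len(df)}")  # should match the number of exported topics
print(f"Columns: {list(df.columns)}")
print(df[["Summary", "FAQLists", "Tags"]].head())
```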