Export posts out of NodeBB into HTML and Markdown flat files -> Halo ITSM

phenomlab · 4 Sept 2025, 17:20

At work, we are transitioning from NodeBB for our Knowledge Base to Halo ITSM, which we require for SOC2 compliance amongst other things. Because I had 165 articles in NodeBB that I didn’t want to re-type, or even copy and paste, I decided to write a Python script to walk the target category and create a file for each topic.

Here’s the script to complete that. There are a number of prerequisites, which I’ve identified below.

import os
import re
import time
import requests
import html2text
from datetime import datetime
 
# --- CONFIGURATION ---
# Your Forum URL goes here (scheme and host, no trailing slash)
BASE_URL = "https://yourforum.com"
# The category ID you want to target goes here
CATEGORY_ID = 3
# In my case, I needed to define a new "home" for the exported files under `/public/uploads` as this contained all the images I needed to embed into the new flat files. Therefore, ASSET_DOMAIN is nothing more than a basic website where I can grab the images from afterwards.
ASSET_DOMAIN = "https://assetlocation.com"
# The below directories are created at the same level as the script if they do not already exist. They will contain both `HTML` and `markdown` copies of the posts.
HTML_DIR = "nodebb_export_html"
MD_DIR = "nodebb_export_markdown"
# Seconds to wait for each HTTP response, and to pause between category pages
REQUEST_TIMEOUT = 10
PAGE_DELAY = 1
# Upper bound on the title part of a file name, leaving room for the topic ID and extension
MAX_TITLE_LENGTH = 200

# Characters that Windows (and, for "/", every platform) rejects in file names
UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
# src="..." / href='...' attributes that point at NodeBB's upload folder
HTML_ASSET_RE = re.compile(r'(src|href)=["\'](/assets/uploads/files/.*?)["\']')
# [text](...) links and ![alt](...) images that point at NodeBB's upload folder
MD_ASSET_RE = re.compile(r'(!?\[.*?\])\((/assets/uploads/files/.*?)\)')


def make_safe_title(title):
    """Turn a topic title into a string that is safe to use in a file name.

    Spaces become underscores; slashes, colons, question marks and the other
    characters matched by UNSAFE_FILENAME_CHARS become hyphens. The result is
    capped at MAX_TITLE_LENGTH characters and never empty.
    """
    safe = UNSAFE_FILENAME_CHARS.sub('-', title.replace(' ', '_'))
    # Windows also refuses names that end in a dot
    safe = safe[:MAX_TITLE_LENGTH].rstrip('.')
    return safe or "untitled"


def format_timestamp(epoch_ms):
    """Format a millisecond epoch timestamp as 'YYYY-MM-DD HH:MM:SS UTC'."""
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    # fromtimestamp(..., tz=utc) replaces utcfromtimestamp(), deprecated since Python 3.12
    moment = datetime.fromtimestamp(epoch_ms / 1000, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S UTC')


def rewrite_html_assets(content_html):
    """Prefix upload paths in src/href attributes with ASSET_DOMAIN."""
    # A callable replacement keeps backslashes in ASSET_DOMAIN from being read as regex escapes
    return HTML_ASSET_RE.sub(
        lambda m: f'{m.group(1)}="{ASSET_DOMAIN}{m.group(2)}"',
        content_html,
    )


def rewrite_markdown_assets(content_md):
    """Prefix upload paths in Markdown links and images with ASSET_DOMAIN."""
    return MD_ASSET_RE.sub(
        lambda m: f'{m.group(1)}({ASSET_DOMAIN}{m.group(2)})',
        content_md,
    )


def extract_tags(topic_data):
    """Return the topic's tags as a comma-separated string ('' when none).

    Tags are looked up under a nested "topic" key first and then at the top
    level of the payload. Entries may be plain strings or objects carrying a
    "value" field.
    NOTE(review): which of these shapes your NodeBB version returns should be
    confirmed against a real /api/topic/<tid> response.
    """
    raw_tags = (topic_data.get("topic") or {}).get("tags") or topic_data.get("tags") or []
    names = []
    for tag in raw_tags:
        name = tag.get("value") if isinstance(tag, dict) else tag
        if name:
            names.append(str(name))
    return ", ".join(names)


def fetch_json(url):
    """GET *url* and decode the JSON body.

    Returns a ``(payload, error)`` pair: ``error`` is None on success,
    otherwise a short description (status code or exception text) and
    ``payload`` is None. Network failures are reported, not raised, so one
    bad request cannot abort the whole export.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        return None, str(exc)
    if response.status_code != 200:
        return None, str(response.status_code)
    try:
        return response.json(), None
    except ValueError:
        return None, "response was not valid JSON"


def export_topic(tid, title, topic_data, converter):
    """Write one topic to an HTML file and a Markdown file.

    Returns the ``(html_file, md_file)`` paths that were written.
    NOTE(review): only the posts present in *topic_data* are exported; if the
    API paginates long topics, later pages are not fetched here.
    """
    posts = topic_data.get("posts", [])
    tag_list = extract_tags(topic_data)

    safe_title = make_safe_title(title)
    html_file = f"{HTML_DIR}/{tid}-{safe_title}.html"
    md_file = f"{MD_DIR}/{tid}-{safe_title}.md"

    with open(html_file, "w", encoding="utf-8") as f_html:
        with open(md_file, "w", encoding="utf-8") as f_md:
            # --- HTML header ---
            f_html.write(f"<html><head><title>{title}</title></head><body>\n")
            f_html.write(f"<h1>{title}</h1>\n")
            if tag_list:
                f_html.write(f"<p><strong>Tags:</strong> {tag_list}</p>\n")

            # --- Markdown metadata block, then the title heading ---
            f_md.write("<!-- FAQLists: Knowledge Base -->\n")
            if tag_list:
                f_md.write(f"<!-- Tags: {tag_list} -->\n")
            f_md.write("\n")
            f_md.write(f"# {title}\n\n")

            for post in posts:
                username = post['user']['username']
                content_html = post['content']
                timestamp = format_timestamp(post['timestamp'])
                pid = post['pid']

                # Convert the untouched HTML first, then rewrite asset paths per format
                content_md = rewrite_markdown_assets(converter.handle(content_html).strip())
                content_html = rewrite_html_assets(content_html)

                f_html.write("<div class='post'>\n")
                f_html.write(f"<h3><strong>Original Author: {username}</strong></h3>\n")
                f_html.write(f"<p><em>Posted on: {timestamp} &nbsp;|&nbsp; Post ID: {pid}</em></p>\n")
                f_html.write(f"{content_html}\n")
                f_html.write("<hr/>\n</div>\n")

                f_md.write(f"**Original Author: {username}**\n\n")
                f_md.write(f"_Posted on: {timestamp}  |  Post ID: {pid}_\n\n")
                f_md.write(f"{content_md}\n\n---\n\n")

            f_html.write("</body></html>\n")

    return html_file, md_file


def main():
    """Walk every page of the category and export each topic it lists."""
    os.makedirs(HTML_DIR, exist_ok=True)
    os.makedirs(MD_DIR, exist_ok=True)

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.body_width = 0

    page = 1
    total_exported = 0

    print(f"🔄 Starting export for category {CATEGORY_ID} from {BASE_URL}")

    while True:
        print(f"📄 Fetching page {page}...")
        data, error = fetch_json(f"{BASE_URL}/api/category/{CATEGORY_ID}?page={page}")
        if error is not None:
            print(f"❌ Failed to fetch page {page}: {error}")
            break

        topics = data.get("topics", [])
        if not topics:
            print("✅ No more topics found. Export complete.")
            break

        for topic in topics:
            tid = topic['tid']
            title = topic['title']
            print(f"→ Exporting topic {tid}: {title}")

            topic_data, error = fetch_json(f"{BASE_URL}/api/topic/{tid}")
            if error is not None:
                # Skip this topic but keep exporting the rest
                print(f"⚠️ Failed to fetch topic {tid}: {error}")
                continue

            html_file, md_file = export_topic(tid, title, topic_data, converter)
            total_exported += 1
            print(f"✔ Saved: {html_file} & {md_file}")

        page += 1
        time.sleep(PAGE_DELAY)

    print(f"\n🎉 Done! Exported {total_exported} topics to '{HTML_DIR}' and '{MD_DIR}'")


if __name__ == "__main__":
    main()

Run the script using python scriptname.py.

If the script fails, it’s likely because you do not have the required modules installed in Python

import os
import re
import time
import requests
import html2text

In this case, you’d need to install the third-party ones (os, re and time ship with Python itself) using, for example, pip install requests html2text

To get them into an Excel file where they can all be bulk imported, we’d then use something like the below script

import os
import re
import pandas as pd
from datetime import datetime
import markdown
 
# --- CONFIGURATION ---
export_dir = "nodebb_export_markdown"
output_file = "Halo_KB_Import_HTML.xlsx"
# This value can be whatever suits your needs
created_by = "Import"
# Change "Knowledge Base" to reflect what you are using in Halo
default_faqlists = "Knowledge Base"
today = datetime.today().strftime('%Y-%m-%d')

# Column order of the generated spreadsheet
COLUMNS = ["Summary", "Details", "Resolution", "DateAdded", "CreatedBy", "FAQLists", "Tags"]

# Metadata comments found at the top of each exported Markdown file
FAQ_RE = re.compile(r"<!-- FAQLists:\s*(.*?)\s*-->")
TAGS_RE = re.compile(r"<!-- Tags:\s*(.*?)\s*-->")


def split_metadata(lines):
    """Separate the leading ``<!-- ... -->`` metadata lines from the body.

    Returns ``(faqlists, tags, body_lines)``. ``faqlists`` falls back to
    ``default_faqlists`` and ``tags`` to '' when the matching comment is
    absent. *lines* itself is not modified.
    """
    faqlists = default_faqlists
    tags = ""

    index = 0
    while index < len(lines) and lines[index].startswith("<!--"):
        line = lines[index].strip()
        faq_match = FAQ_RE.match(line)
        tag_match = TAGS_RE.match(line)
        if faq_match:
            faqlists = faq_match.group(1)
        if tag_match:
            tags = tag_match.group(1)
        index += 1

    return faqlists, tags, lines[index:]


def summary_from(filename, body_lines):
    """Return the article title to use as the Halo "Summary".

    The first non-blank body line is used when it is a ``# `` heading,
    because the file name is a lossy version of the title (characters were
    substituted to make it a valid name). Otherwise the summary is rebuilt
    from the file name: the text after the first '-' with underscores turned
    back into spaces, or the whole stem when there is no '-'.
    """
    for line in body_lines:
        text = line.strip()
        if not text:
            continue
        if text.startswith("# "):
            return text[2:].strip()
        break

    stem = filename[:-3] if filename.endswith(".md") else filename
    _, separator, remainder = stem.partition('-')
    return (remainder if separator else stem).replace('_', ' ')


def main():
    """Convert every Markdown file in export_dir into one row of the Excel import sheet."""
    # --- BUILD DATAFRAME FOR HALO ---
    import_rows = []

    for filename in sorted(os.listdir(export_dir)):
        if not filename.endswith(".md"):
            continue

        filepath = os.path.join(export_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()

        faqlists, tags, body_lines = split_metadata(lines)
        html_content = markdown.markdown(''.join(body_lines))

        import_rows.append({
            "Summary": summary_from(filename, body_lines),
            "Details": html_content,
            "Resolution": "",
            "DateAdded": today,
            "CreatedBy": created_by,
            "FAQLists": faqlists,
            "Tags": tags,
        })

    # --- EXPORT TO EXCEL ---
    # Explicit columns keep the header row even when no files were found.
    # Writing .xlsx needs an Excel engine such as openpyxl installed alongside pandas.
    df = pd.DataFrame(import_rows, columns=COLUMNS)
    df.to_excel(output_file, index=False)

    print(f"✅ Done! Halo HTML import file created: {output_file}")


if __name__ == "__main__":
    main()

This then generates a file called Halo_KB_Import_HTML.xlsx which you can then use to import each exported post into Halo.

Cool eh? Huge time saver

sudonix

Export posts out of NodeBB into HTML and Markdown flat files -> Halo ITSM

Related Topics

What is this bar called?

Can you adjust Admin settings on your NodeBB?

Further Widgets question

NodeBB: updating Admin details not working

Chevron up before & after

Podcast Share NodeBB

Changing Background on NodeBB

Recent Cards plugin customization

	import os
	import re
	import time
	import requests
	import html2text
	from datetime import datetime

	# --- CONFIGURATION ---
	# Your Forum URL goes here (scheme and host, no trailing slash)
	BASE_URL = "https://yourforum.com"
	# The category ID you want to target goes here
	CATEGORY_ID = 3
	# In my case, I needed to define a new "home" for the exported files under `/public/uploads` as this contained all the images I needed to embed into the new flat files. Therefore, ASSET_DOMAIN is nothing more than a basic website where I can grab the images from afterwards.
	ASSET_DOMAIN = "https://assetlocation.com"
	# The below directories are created at the same level as the script if they do not already exist. They will contain both `HTML` and `markdown` copies of the posts.
	HTML_DIR = "nodebb_export_html"
	MD_DIR = "nodebb_export_markdown"
	# Seconds to wait for each HTTP response, and to pause between category pages
	REQUEST_TIMEOUT = 10
	PAGE_DELAY = 1
	# Upper bound on the title part of a file name, leaving room for the topic ID and extension
	MAX_TITLE_LENGTH = 200

	# Characters that Windows (and, for "/", every platform) rejects in file names
	UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
	# src="..." / href='...' attributes that point at NodeBB's upload folder
	HTML_ASSET_RE = re.compile(r'(src|href)=["\'](/assets/uploads/files/.*?)["\']')
	# [text](...) links and ![alt](...) images that point at NodeBB's upload folder
	MD_ASSET_RE = re.compile(r'(!?\[.*?\])\((/assets/uploads/files/.*?)\)')


	def make_safe_title(title):
	    """Turn a topic title into a string that is safe to use in a file name.

	    Spaces become underscores; slashes, colons, question marks and the other
	    characters matched by UNSAFE_FILENAME_CHARS become hyphens. The result is
	    capped at MAX_TITLE_LENGTH characters and never empty.
	    """
	    safe = UNSAFE_FILENAME_CHARS.sub('-', title.replace(' ', '_'))
	    # Windows also refuses names that end in a dot
	    safe = safe[:MAX_TITLE_LENGTH].rstrip('.')
	    return safe or "untitled"


	def format_timestamp(epoch_ms):
	    """Format a millisecond epoch timestamp as 'YYYY-MM-DD HH:MM:SS UTC'."""
	    # Local import: the file-level import only brings in `datetime`.
	    from datetime import timezone

	    # fromtimestamp(..., tz=utc) replaces utcfromtimestamp(), deprecated since Python 3.12
	    moment = datetime.fromtimestamp(epoch_ms / 1000, tz=timezone.utc)
	    return moment.strftime('%Y-%m-%d %H:%M:%S UTC')


	def rewrite_html_assets(content_html):
	    """Prefix upload paths in src/href attributes with ASSET_DOMAIN."""
	    # A callable replacement keeps backslashes in ASSET_DOMAIN from being read as regex escapes
	    return HTML_ASSET_RE.sub(
	        lambda m: f'{m.group(1)}="{ASSET_DOMAIN}{m.group(2)}"',
	        content_html,
	    )


	def rewrite_markdown_assets(content_md):
	    """Prefix upload paths in Markdown links and images with ASSET_DOMAIN."""
	    return MD_ASSET_RE.sub(
	        lambda m: f'{m.group(1)}({ASSET_DOMAIN}{m.group(2)})',
	        content_md,
	    )


	def extract_tags(topic_data):
	    """Return the topic's tags as a comma-separated string ('' when none).

	    Tags are looked up under a nested "topic" key first and then at the top
	    level of the payload. Entries may be plain strings or objects carrying a
	    "value" field.
	    NOTE(review): which of these shapes your NodeBB version returns should be
	    confirmed against a real /api/topic/<tid> response.
	    """
	    raw_tags = (topic_data.get("topic") or {}).get("tags") or topic_data.get("tags") or []
	    names = []
	    for tag in raw_tags:
	        name = tag.get("value") if isinstance(tag, dict) else tag
	        if name:
	            names.append(str(name))
	    return ", ".join(names)


	def fetch_json(url):
	    """GET *url* and decode the JSON body.

	    Returns a ``(payload, error)`` pair: ``error`` is None on success,
	    otherwise a short description (status code or exception text) and
	    ``payload`` is None. Network failures are reported, not raised, so one
	    bad request cannot abort the whole export.
	    """
	    try:
	        response = requests.get(url, timeout=REQUEST_TIMEOUT)
	    except requests.RequestException as exc:
	        return None, str(exc)
	    if response.status_code != 200:
	        return None, str(response.status_code)
	    try:
	        return response.json(), None
	    except ValueError:
	        return None, "response was not valid JSON"


	def export_topic(tid, title, topic_data, converter):
	    """Write one topic to an HTML file and a Markdown file.

	    Returns the ``(html_file, md_file)`` paths that were written.
	    NOTE(review): only the posts present in *topic_data* are exported; if the
	    API paginates long topics, later pages are not fetched here.
	    """
	    posts = topic_data.get("posts", [])
	    tag_list = extract_tags(topic_data)

	    safe_title = make_safe_title(title)
	    html_file = f"{HTML_DIR}/{tid}-{safe_title}.html"
	    md_file = f"{MD_DIR}/{tid}-{safe_title}.md"

	    with open(html_file, "w", encoding="utf-8") as f_html:
	        with open(md_file, "w", encoding="utf-8") as f_md:
	            # --- HTML header ---
	            f_html.write(f"<html><head><title>{title}</title></head><body>\n")
	            f_html.write(f"<h1>{title}</h1>\n")
	            if tag_list:
	                f_html.write(f"<p><strong>Tags:</strong> {tag_list}</p>\n")

	            # --- Markdown metadata block, then the title heading ---
	            f_md.write("<!-- FAQLists: Knowledge Base -->\n")
	            if tag_list:
	                f_md.write(f"<!-- Tags: {tag_list} -->\n")
	            f_md.write("\n")
	            f_md.write(f"# {title}\n\n")

	            for post in posts:
	                username = post['user']['username']
	                content_html = post['content']
	                timestamp = format_timestamp(post['timestamp'])
	                pid = post['pid']

	                # Convert the untouched HTML first, then rewrite asset paths per format
	                content_md = rewrite_markdown_assets(converter.handle(content_html).strip())
	                content_html = rewrite_html_assets(content_html)

	                f_html.write("<div class='post'>\n")
	                f_html.write(f"<h3><strong>Original Author: {username}</strong></h3>\n")
	                f_html.write(f"<p><em>Posted on: {timestamp} &nbsp;|&nbsp; Post ID: {pid}</em></p>\n")
	                f_html.write(f"{content_html}\n")
	                f_html.write("<hr/>\n</div>\n")

	                f_md.write(f"**Original Author: {username}**\n\n")
	                f_md.write(f"_Posted on: {timestamp}  |  Post ID: {pid}_\n\n")
	                f_md.write(f"{content_md}\n\n---\n\n")

	            f_html.write("</body></html>\n")

	    return html_file, md_file


	def main():
	    """Walk every page of the category and export each topic it lists."""
	    os.makedirs(HTML_DIR, exist_ok=True)
	    os.makedirs(MD_DIR, exist_ok=True)

	    converter = html2text.HTML2Text()
	    converter.ignore_links = False
	    converter.body_width = 0

	    page = 1
	    total_exported = 0

	    print(f"🔄 Starting export for category {CATEGORY_ID} from {BASE_URL}")

	    while True:
	        print(f"📄 Fetching page {page}...")
	        data, error = fetch_json(f"{BASE_URL}/api/category/{CATEGORY_ID}?page={page}")
	        if error is not None:
	            print(f"❌ Failed to fetch page {page}: {error}")
	            break

	        topics = data.get("topics", [])
	        if not topics:
	            print("✅ No more topics found. Export complete.")
	            break

	        for topic in topics:
	            tid = topic['tid']
	            title = topic['title']
	            print(f"→ Exporting topic {tid}: {title}")

	            topic_data, error = fetch_json(f"{BASE_URL}/api/topic/{tid}")
	            if error is not None:
	                # Skip this topic but keep exporting the rest
	                print(f"⚠️ Failed to fetch topic {tid}: {error}")
	                continue

	            html_file, md_file = export_topic(tid, title, topic_data, converter)
	            total_exported += 1
	            print(f"✔ Saved: {html_file} & {md_file}")

	        page += 1
	        time.sleep(PAGE_DELAY)

	    print(f"\n🎉 Done! Exported {total_exported} topics to '{HTML_DIR}' and '{MD_DIR}'")


	if __name__ == "__main__":
	    main()

	import os
	import re
	import pandas as pd
	from datetime import datetime
	import markdown

	# --- CONFIGURATION ---
	export_dir = "nodebb_export_markdown"
	output_file = "Halo_KB_Import_HTML.xlsx"
	# This value can be whatever suits your needs
	created_by = "Import"
	# Change "Knowledge Base" to reflect what you are using in Halo
	default_faqlists = "Knowledge Base"
	today = datetime.today().strftime('%Y-%m-%d')

	# Column order of the generated spreadsheet
	COLUMNS = ["Summary", "Details", "Resolution", "DateAdded", "CreatedBy", "FAQLists", "Tags"]

	# Metadata comments found at the top of each exported Markdown file
	FAQ_RE = re.compile(r"<!-- FAQLists:\s*(.*?)\s*-->")
	TAGS_RE = re.compile(r"<!-- Tags:\s*(.*?)\s*-->")


	def split_metadata(lines):
	    """Separate the leading ``<!-- ... -->`` metadata lines from the body.

	    Returns ``(faqlists, tags, body_lines)``. ``faqlists`` falls back to
	    ``default_faqlists`` and ``tags`` to '' when the matching comment is
	    absent. *lines* itself is not modified.
	    """
	    faqlists = default_faqlists
	    tags = ""

	    index = 0
	    while index < len(lines) and lines[index].startswith("<!--"):
	        line = lines[index].strip()
	        faq_match = FAQ_RE.match(line)
	        tag_match = TAGS_RE.match(line)
	        if faq_match:
	            faqlists = faq_match.group(1)
	        if tag_match:
	            tags = tag_match.group(1)
	        index += 1

	    return faqlists, tags, lines[index:]


	def summary_from(filename, body_lines):
	    """Return the article title to use as the Halo "Summary".

	    The first non-blank body line is used when it is a ``# `` heading,
	    because the file name is a lossy version of the title (characters were
	    substituted to make it a valid name). Otherwise the summary is rebuilt
	    from the file name: the text after the first '-' with underscores turned
	    back into spaces, or the whole stem when there is no '-'.
	    """
	    for line in body_lines:
	        text = line.strip()
	        if not text:
	            continue
	        if text.startswith("# "):
	            return text[2:].strip()
	        break

	    stem = filename[:-3] if filename.endswith(".md") else filename
	    _, separator, remainder = stem.partition('-')
	    return (remainder if separator else stem).replace('_', ' ')


	def main():
	    """Convert every Markdown file in export_dir into one row of the Excel import sheet."""
	    # --- BUILD DATAFRAME FOR HALO ---
	    import_rows = []

	    for filename in sorted(os.listdir(export_dir)):
	        if not filename.endswith(".md"):
	            continue

	        filepath = os.path.join(export_dir, filename)
	        with open(filepath, "r", encoding="utf-8") as f:
	            lines = f.readlines()

	        faqlists, tags, body_lines = split_metadata(lines)
	        html_content = markdown.markdown(''.join(body_lines))

	        import_rows.append({
	            "Summary": summary_from(filename, body_lines),
	            "Details": html_content,
	            "Resolution": "",
	            "DateAdded": today,
	            "CreatedBy": created_by,
	            "FAQLists": faqlists,
	            "Tags": tags,
	        })

	    # --- EXPORT TO EXCEL ---
	    # Explicit columns keep the header row even when no files were found.
	    # Writing .xlsx needs an Excel engine such as openpyxl installed alongside pandas.
	    df = pd.DataFrame(import_rows, columns=COLUMNS)
	    df.to_excel(output_file, index=False)

	    print(f"✅ Done! Halo HTML import file created: {output_file}")


	if __name__ == "__main__":
	    main()

sudonix

Export posts out of NodeBB into HTML and Markdown flat files -> Halo ITSM

Related Topics

Individual Categories