Skip to content

Export posts out of NodeBB into HTML and Markdown flat files -> Halo ITSM

Guides
1 1 575 1
  • At work, we are transitioning from NodeBB for our Knowledge Base to Halo ITSM, which we require for SOC2 compliance amongst other things. Because I had 165 articles in NodeBB that I didn’t want to re-type, or even copy and paste, I decided to write a Python script to walk the target category and create a file for each topic.

    Here’s the script to complete that. There are a number of prerequisites here, which I’ve identified below.

    import os
    import re
    import time
    from datetime import datetime, timezone

    import html2text
    import requests
    
    # --- CONFIGURATION ---
    # Your forum URL goes here (scheme included, no trailing slash)
    BASE_URL = "https://yourforum.com"
    # The category ID you want to target goes here
    CATEGORY_ID = 3
    # In my case, I needed to define a new "home" for the files under `/public/uploads`,
    # as this contained all the images I needed to embed into the new flat files.
    # ASSET_DOMAIN is therefore nothing more than a basic website the images can be
    # fetched from afterwards; it is prefixed to every `/assets/uploads/files/...` path.
    ASSET_DOMAIN = "https://assetlocation.com"
    # Output directories, relative to the directory the script is run from. They are
    # created automatically if they do not exist and will contain the `HTML` and
    # `markdown` copies of the posts respectively.
    HTML_DIR = "nodebb_export_html"
    MD_DIR = "nodebb_export_markdown"
    # Seconds to wait for each HTTP request before giving up
    REQUEST_TIMEOUT = 10
    os.makedirs(HTML_DIR, exist_ok=True)
    os.makedirs(MD_DIR, exist_ok=True)

    # HTML -> Markdown converter: keep hyperlinks, and use a body width of 0 so the
    # converter does not hard-wrap long lines.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0

    # Patterns are compiled once instead of on every post.
    # Matches src="/assets/uploads/files/..." and href='/assets/uploads/files/...'.
    HTML_ASSET_RE = re.compile(r'(src|href)=["\'](/assets/uploads/files/.*?)["\']')
    # Matches the target of a Markdown link or image: ](/assets/uploads/files/...)
    MD_ASSET_RE = re.compile(r'\]\((/assets/uploads/files/.*?)\)')
    # Characters that are not allowed in file names on at least one common platform.
    UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


    def rewrite_html_assets(html):
        """Return `html` with local upload paths in src/href prefixed by ASSET_DOMAIN."""
        # A function replacement is used so ASSET_DOMAIN is inserted literally and is
        # never interpreted as a regex replacement template.
        return HTML_ASSET_RE.sub(
            lambda m: f'{m.group(1)}="{ASSET_DOMAIN}{m.group(2)}"', html
        )


    def rewrite_md_assets(md):
        """Return `md` with local upload paths in links/images prefixed by ASSET_DOMAIN."""
        return MD_ASSET_RE.sub(lambda m: f']({ASSET_DOMAIN}{m.group(1)})', md)


    def make_safe_title(title):
        """Turn a topic title into a file-name fragment (spaces -> `_`, unsafe chars -> `-`)."""
        return UNSAFE_FILENAME_RE.sub('-', title.replace(' ', '_'))


    page = 1
    total_exported = 0
    # Topic IDs already written, so a topic repeated on a later page is not exported
    # twice and a server that keeps returning the same page cannot loop forever.
    seen_tids = set()

    print(f"🔄 Starting export for category {CATEGORY_ID} from {BASE_URL}")

    while True:
        print(f"📄 Fetching page {page}...")
        url = f"{BASE_URL}/api/category/{CATEGORY_ID}?page={page}"
        try:
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Timeouts, DNS failures, refused connections, ...
            print(f"❌ Failed to fetch page {page}: {exc}")
            break
        if res.status_code != 200:
            print(f"❌ Failed to fetch page {page}: {res.status_code}")
            break

        data = res.json()
        topics = [t for t in data.get("topics", []) if t['tid'] not in seen_tids]
        if not topics:
            print("✅ No more topics found. Export complete.")
            break

        for topic in topics:
            tid = topic['tid']
            title = topic['title']
            seen_tids.add(tid)
            print(f"→ Exporting topic {tid}: {title}")

            topic_url = f"{BASE_URL}/api/topic/{tid}"
            try:
                topic_res = requests.get(topic_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable topic should not abort the whole export.
                print(f"⚠️ Failed to fetch topic {tid}: {exc}")
                continue
            if topic_res.status_code != 200:
                print(f"⚠️ Failed to fetch topic {tid}")
                continue

            topic_data = topic_res.json()
            # NOTE(review): only the posts contained in this single response are
            # exported; if your forum paginates long topics, later pages of replies
            # are not fetched - confirm against your NodeBB settings.
            posts = topic_data.get("posts", [])
            # Look for tags under "topic" first and then at the top level of the
            # response. NOTE(review): tags may be plain strings or objects carrying
            # the text under "value" - confirm against your NodeBB version.
            raw_tags = (
                (topic_data.get("topic") or {}).get("tags")
                or topic_data.get("tags")
                or []
            )
            tag_list = ", ".join(
                str(tag.get("value", "")) if isinstance(tag, dict) else str(tag)
                for tag in raw_tags
            )

            safe_title = make_safe_title(title)
            html_file = f"{HTML_DIR}/{tid}-{safe_title}.html"
            md_file = f"{MD_DIR}/{tid}-{safe_title}.md"

            # Extract the per-post fields once; both exports below reuse them.
            post_rows = []
            for post in posts:
                # The API timestamp is divided by 1000 to get seconds, then rendered as UTC.
                posted_at = datetime.fromtimestamp(
                    post['timestamp'] / 1000, tz=timezone.utc
                ).strftime('%Y-%m-%d %H:%M:%S UTC')
                post_rows.append(
                    (post['user']['username'], posted_at, post['pid'], post['content'])
                )

            # --- HTML EXPORT ---
            with open(html_file, "w", encoding="utf-8") as f_html:
                f_html.write(f"<html><head><title>{title}</title></head><body>\n")
                f_html.write(f"<h1>{title}</h1>\n")
                if tag_list:
                    f_html.write(f"<p><strong>Tags:</strong> {tag_list}</p>\n")

                for username, timestamp, pid, content_html in post_rows:
                    # Rewrite asset paths in HTML
                    content_html = rewrite_html_assets(content_html)

                    f_html.write("<div class='post'>\n")
                    f_html.write(f"<h3><strong>Original Author: {username}</strong></h3>\n")
                    f_html.write(f"<p><em>Posted on: {timestamp} &nbsp;|&nbsp; Post ID: {pid}</em></p>\n")
                    f_html.write(f"{content_html}\n")
                    f_html.write("<hr/>\n</div>\n")

                f_html.write("</body></html>\n")

            # --- MARKDOWN EXPORT ---
            with open(md_file, "w", encoding="utf-8") as f_md:
                # Metadata block (HTML comments at the top of the file)
                f_md.write("<!-- FAQLists: Knowledge Base -->\n")
                if tag_list:
                    f_md.write(f"<!-- Tags: {tag_list} -->\n")
                f_md.write("\n")

                f_md.write(f"# {title}\n\n")

                for username, timestamp, pid, content_html in post_rows:
                    # Convert the original post HTML to Markdown, then rewrite asset paths
                    content_md = rewrite_md_assets(h.handle(content_html).strip())

                    f_md.write(f"**Original Author: {username}**\n\n")
                    f_md.write(f"_Posted on: {timestamp}  |  Post ID: {pid}_\n\n")
                    f_md.write(f"{content_md}\n\n---\n\n")

            total_exported += 1
            print(f"✔ Saved: {html_file} & {md_file}")

        page += 1
        # Be polite to the forum: pause between category pages.
        time.sleep(1)

    print(f"\n🎉 Done! Exported {total_exported} topics to '{HTML_DIR}' and '{MD_DIR}'")
    
    

    Run the script using python scriptname.py.

    If the script fails, it’s likely because you do not have the required modules installed in Python:

    import os
    import re
    import time
    import requests
    import html2text
    

    In this case, you’d need to install the missing ones using pip, for example pip install requests html2text (os, re and time ship with Python and need no installation).

    To get the exported posts into an Excel file from which they can all be bulk imported, we’d then use something like the script below (it additionally needs the pandas, markdown and openpyxl packages):

    import os
    import re
    import pandas as pd
    from datetime import datetime
    import markdown
    
    # --- CONFIGURATION ---
    # Directory holding the exported `.md` files
    export_dir = "nodebb_export_markdown"
    # Name of the Excel workbook to generate
    output_file = "Halo_KB_Import_HTML.xlsx"
    # This value can be whatever suits your needs
    created_by = "Import"
    # Change "Knowledge Base" to reflect what you are using in Halo. It is only used
    # for files that carry no `<!-- FAQLists: ... -->` comment of their own.
    default_faqlists = "Knowledge Base"
    today = datetime.today().strftime('%Y-%m-%d')

    # --- BUILD DATAFRAME FOR HALO ---
    import_rows = []

    for filename in sorted(os.listdir(export_dir)):
        if not filename.endswith(".md"):
            continue

        filepath = os.path.join(export_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Default values, overridden by the metadata comments when present
        faqlists = default_faqlists
        tags = ""

        # Parse the metadata comments from the top of the file; `body_start` ends up
        # as the index of the first line that is not a `<!-- ... -->` comment.
        body_start = 0
        while body_start < len(lines) and lines[body_start].startswith("<!--"):
            meta = lines[body_start].strip()
            faq_match = re.match(r"<!-- FAQLists:\s*(.*?)\s*-->", meta)
            tag_match = re.match(r"<!-- Tags:\s*(.*?)\s*-->", meta)
            if faq_match:
                faqlists = faq_match.group(1)
            if tag_match:
                tags = tag_match.group(1)
            body_start += 1

        body_lines = lines[body_start:]
        markdown_content = ''.join(body_lines)
        html_content = markdown.markdown(markdown_content)

        # Prefer the exact title from the leading `# ` heading: the file name is a
        # lossy copy of it (spaces became `_`, some characters became `-`).
        summary = ""
        for body_line in body_lines:
            if not body_line.strip():
                continue
            if body_line.startswith("# "):
                summary = body_line[2:].strip()
            # Only the first non-blank line is considered as a heading.
            break

        if not summary:
            # Fall back to the file name: drop the extension and the leading
            # "<id>-" prefix (when there is one) and turn `_` back into spaces.
            stem = filename[:-len(".md")]
            _prefix, dash, remainder = stem.partition('-')
            summary = (remainder if dash else stem).replace('_', ' ')

        import_rows.append({
            "Summary": summary,
            "Details": html_content,
            "Resolution": "",
            "DateAdded": today,
            "CreatedBy": created_by,
            "FAQLists": faqlists,
            "Tags": tags,
        })

    # --- EXPORT TO EXCEL ---
    df = pd.DataFrame(import_rows)
    df.to_excel(output_file, index=False)

    print(f"✅ Done! Halo HTML import file created: {output_file}")
    

    This then generates a file called Halo_KB_Import_HTML.xlsx which you can then use to import each exported post into Halo.

    Cool eh? Huge time saver 🙂


Related Topics
  • Test of youtube embeds

    Solved Configure nodebb
    14
    11 Votes
    14 Posts
    2k Views
    @phenomlab Perfect!!! Many thanks.
  • What is this bar called?

    Solved Customisation nodebb
    92
    1
    36 Votes
    92 Posts
    22k Views
    This is good
  • Removing blue 'moved' tag from post

    Solved Configure nodebb
    16
    2
    3 Votes
    16 Posts
    3k Views
    @phenomlab Ah, got it working! I reversed the CSS addition to put z index high, and then I could see another error box saying fork title must be at least 3 characters. So made the new fork title longer and button responded.
  • Where are widgets stored?

    Solved Configure nodebb
    3
    1 Votes
    3 Posts
    878 Views
    @phenomlab Thanks, have DMed you
  • 2 Votes
    6 Posts
    1k Views
    @dave1904 I’d start by adding a console.log function to hookData so you can see what is being returned return hookData; console.log(hookData):
  • NodeBB: Favicon upload issue

    Solved Configure nodebb favicon
    12
    1
    3 Votes
    12 Posts
    2k Views
    @phenomlab I am on a Mac, so I used the “Option + Command + I”, and then performed the steps. It loaded my favicon! I checked on Firefox which I haven’t used before, and it showed my favicon also! That’s fantastic and thank you for the help!
  • [NODEBB] Help for my custom CSS

    Solved Customisation nodebb css bugfix
    237
    49 Votes
    237 Posts
    86k Views
    @baris said: You should change your selectors so it doesn’t look at the entire document. You probably only want to apply fancybox to stuff inside the #content element which is what changes when the user navigates around the page. So use $('#content a').... for your selectors then the forum logo in the header won’t be selected. I modified the JS Fancybox code now and this code and it seem better // --------------------------------------------- // Fancybox Media Reader (Without Website Logo) // --------------------------------------------- if (top.location.pathname !== '/login') { $(window).on('action:posts.loaded', function(data) { console.log("Polling DOM for lazyLoaded images to apply Fancybox"); $(document).ready(function() { $('#content a').not('.forum-logo').not(".avatar").not(".emoji").not(".bmac-noanimate").each(function() { $('#content a[href*=".jpg"], #content a[href*=".jpeg"], #content a[href*=".png"], #content a[href*=".gif"], #content a[href*=".webp"]').addClass("noanimate"); }); }); }); } if (top.location.pathname !== '/login') { $(document).ready(function() { $(window).on('action:ajaxify.end', function(data) { $('#content a').not('.logo').not(".avatar").not(".emoji").not(".bmac-noanimate").each(function() { $('#content a[href*=".jpg"], #content a[href*=".jpeg"], #content a[href*=".png"], #content a[href*=".gif"], #content a[href*=".webp"]').addClass("noanimate"); data.preventDefault() // Strip out the images contained inside blockquotes as this looks nasty :) $('#content blockquote img').remove(); }); Fancybox.bind( '#content a[href*=".jpg"], #content a[href*=".jpeg"], #content a[href*=".png"], #content a[href*=".gif"], #content a[href*=".webp"]', { groupAll: true, } ); }); }); } // Chat fancybox - fires when chat module loaded and AJAX calls new chat $(document).ready(function() { $(window).on('action:chat.loaded', function(data) { // >>> Se limiter au contenu principal uniquement <<< $('#content 
img').not('.forum-logo').not(".avatar").not(".emoji").not(".bmac-noanimate").each(function() { var newHref = $(this).attr("src"); $(this).wrap("<a class='fancybox' href='" + newHref + "'/>"); $('#content a[href*=".jpg"], #content a[href*=".jpeg"], #content a[href*=".png"], #content a[href*=".gif"], #content a[href*=".webp"]').addClass("noanimate"); data.preventDefault(); // Strip out the images contained inside blockquotes as this looks nasty :) $('#content blockquote img').remove(); }); Fancybox.bind( '#content a[href*=".jpg"], #content a[href*=".jpeg"], #content a[href*=".png"], #content a[href*=".gif"], #content a[href*=".webp"]', { groupAll: true, } ); }); }); For the logo, I must use overflow: visible !important; on [component="brand/logo"] /* --- Logo --- */ [component="brand/logo"] { max-height: 50px; width: auto; height: auto; max-width: 100%; display: block; object-fit: contain; object-position: left center; overflow: visible !important; } Better result !!
  • NodeBB Design help

    Solved Customisation
    8
    3
    2 Votes
    8 Posts
    2k Views
    @riekmedia I’ve applied some new CSS to your site. Can you reload the page and try again ? For the record, this is what I added #footer { background: #2d343e; border-top: 4px solid #2d343e; font-size: 0.9em; margin-top: 70px; padding: 80px 0 0; position: relative; clear: both; bottom: 0; left: 0; right: 0; z-index: 1000; margin-left: -15px; margin-right: -338px; } The /categories page seems a bit messed up, so looking at that currently EDIT - issued some override CSS in the CATEGORIES widget <!--- CSS fix for overspill on /categories page - DO NOT DELETE --> <style> #footer { margin-right: -45px; } </style> That should resolve the /categories issue.