# mesa/convert-sphinx.py

import os, glob
from bs4 import BeautifulSoup
from subprocess import run, PIPE
from urllib.parse import urlparse
import dashtable

def html_to_rst(input):
    """Convert an HTML fragment to reStructuredText by piping it through pandoc.

    Args:
        input: HTML markup as a string; it is sent to pandoc on stdin.

    Returns:
        The reStructuredText that pandoc wrote to stdout, as a string.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        OSError: If the ``pandoc`` executable cannot be started.
    """
    # The parameter keeps its builtin-shadowing name so keyword callers
    # (``html_to_rst(input=...)``) continue to work.
    result = run(
        ['pandoc', '-f', 'html', '-t', 'rst'],
        input=input,
        stdout=PIPE,
        universal_newlines=True,
        # Pandoc reads and writes UTF-8 regardless of the locale, so do not
        # let the pipe fall back to the platform's default encoding.
        encoding='utf-8',
        # Fail loudly instead of returning empty/partial output when the
        # conversion itself failed.
        check=True,
    )
    return result.stdout

def convert_toc(filename):
    """Convert the HTML table of contents into ``./docs/contents.rst``.

    The direct children of the page's ``<body>`` are handled one by one:

    * ``<h2>`` opens a hidden ``toctree`` directive captioned with the
      heading text;
    * ``<ul>`` adds one toctree entry per ``<li>`` link: ``index.html``
      becomes ``self``, URLs with a network location keep their title and
      address, and any other link is written without its file extension;
    * ``<dl>`` adds a single titled entry for the first link it contains.

    Any other tag is reported on stdout and terminates the script with exit
    status 1. List items or ``<dl>`` elements without a usable link are
    skipped.

    Args:
        filename: Path of the HTML contents page to read.
    """
    with open(filename, encoding='utf8') as source:
        soup = BeautifulSoup(source, 'html5lib')
        body = soup.find('body')
        with open('./docs/contents.rst', 'w', encoding='utf-8') as output:
            for elm in body.contents:
                if elm.name == 'h2':
                    output.write(""".. toctree::
   :maxdepth: 1
   :caption: {0}
   :hidden:\n""".format(elm.get_text()))
                elif elm.name == 'ul':
                    output.write('\n')
                    for li in elm.contents:
                        if li.name != 'li':
                            continue
                        a = li.find('a')
                        url = a.get('href') if a is not None else None
                        if not url:
                            # Nothing to point a toctree entry at.
                            continue
                        if url == 'index.html':
                            output.write('   self\n')
                        elif urlparse(url).netloc:
                            # External link: keep the visible title.
                            output.write('   {0} <{1}>\n'.format(a.get_text(), url))
                        else:
                            # Local page: Sphinx wants the document name
                            # without its extension.
                            output.write('   {0}\n'.format(os.path.splitext(url)[0]))
                    output.write('\n')
                elif elm.name == 'dl':
                    a = elm.find('a')
                    url = a.get('href') if a is not None else None
                    if url:
                        # Use this element's own target rather than whatever
                        # URL the preceding list happened to leave behind.
                        output.write('\n   {0} <{1}>\n'.format(a.get_text(), url))
                elif hasattr(elm, 'contents'):
                    # Only elements that carry child nodes reach this point;
                    # bare text between elements is ignored.
                    print('**** UNKNOWN: ' + str(elm))
                    raise SystemExit(1)
    print("SUCCESS: " + filename)

def convert_article(filename):
    """Convert one HTML article into a ``.rst`` file next to it.

    The children of the page's ``<div class="content">`` are converted with
    ``html_to_rst`` and written to a file with the same name as *filename*
    but a ``.rst`` extension. Two pages get extra treatment:

    * ``./docs/release-calendar.html``: its first ``<table>`` is removed from
      the tree, converted separately with ``dashtable`` and appended after
      the converted body.
    * ``./docs/relnotes.html``: a hidden ``toctree`` is appended that lists
      every ``<li>`` link ending in ``.html`` (without the extension),
      followed by an "Older Versions" entry.

    If the page has no content ``<div>``, the problem is reported on stdout
    and the script terminates with exit status 1.

    Args:
        filename: Path of the HTML article to convert.
    """
    with open(filename, encoding='utf8') as source:
        soup = BeautifulSoup(source, 'html5lib')

    table = None
    if filename == './docs/release-calendar.html':
        # extract() detaches the table, so the pandoc pass below no longer
        # sees it; the dashtable rendering is appended afterwards instead.
        table = dashtable.html2rst(str(soup.table.extract()))

    container = soup.find('div', 'content')
    if container is None:
        print('**** NO CONTENT: ' + filename)
        raise SystemExit(1)
    content = html_to_rst(''.join(map(str, container.contents)))

    if table:
        content = '\n'.join([content, table, ''])

    rst_path = os.path.splitext(filename)[0] + '.rst'
    with open(rst_path, 'w', encoding='utf-8') as output:
        output.write(content)
        if filename == './docs/relnotes.html':
            output.write("""\n.. toctree::
   :maxdepth: 1
   :hidden:\n""")
            output.write('\n')
            for li in soup.find_all('li'):
                a = li.find('a')
                href = a.get('href') if a is not None else None
                if not href:
                    # List item without a link target: nothing to list.
                    continue
                stem, ext = os.path.splitext(href)
                if ext == '.html':
                    output.write('   {0}\n'.format(stem))
            output.write('   Older Versions <versions>\n')

    print("SUCCESS: " + filename)

# Walk every HTML page below ./docs: the contents page is turned into the
# toctree index, every other page is converted as a regular article.
for filename in glob.iglob('./docs/**/*.html', recursive=True):
    convert = convert_toc if filename == './docs/contents.html' else convert_article
    convert(filename)