Transform LaTeX to Unicode

Published

April 10, 2023

It is possible to transform a subset of LaTeX to Unicode, as demonstrated by the unicodeit website. Unfortunately, unicodeit only works on short LaTeX strings.

Here, I write a simple parser with Lark to process more complex LaTeX strings. A more complete version of this prototype is available as unicodeitplus.

# Wikipedia: https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
import string

# Lookup table: LaTeX snippet (e.g. "^{2}" or r"_{\beta}") -> Unicode glyph.
COMMANDS = {}

# In every glyph string below the position encodes the source symbol, and a
# plain space marks "no Unicode glyph available for this symbol".
superscript_numbers = "⁰¹²³⁴⁵⁶⁷⁸⁹"
subscript_numbers = "₀₁₂₃₄₅₆₇₈₉"

superscript_lowercase = "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖ𐞥ʳˢᵗᵘᵛʷˣʸᶻ"
superscript_uppercase = "ᴬᴮꟲᴰᴱꟳᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾꟴᴿ ᵀᵁⱽᵂ   "
subscript_lowercase = "ₐ   ₑ  ₕᵢⱼₖₗₘₙₒₚ ᵣₛₜᵤᵥ ₓ  "

# "o" stands in for omicron so that the tuple stays aligned with the
# 24-entry glyph strings below.
greek_lowercase = (
    r"\alpha", r"\beta", r"\gamma", r"\delta", r"\epsilon", r"\zeta",
    r"\eta", r"\theta", r"\iota", r"\kappa", r"\lambda", r"\mu",
    r"\nu", r"\xi", "o", r"\pi", r"\rho", r"\sigma",
    r"\tau", r"\upsilon", r"\phi", r"\chi", r"\psi", r"\omega",
)
superscript_lowercase_greek = " ᵝᵞᵟᵋ  ᶿᶥ          ᶹᵠᵡ  "
subscript_lowercase_greek = " ᵦᵧ             ᵨ   ᵩᵪ  "

# Digits: the index inside the string is the digit being rendered.
COMMANDS.update(
    (f"^{{{digit}}}", glyph) for digit, glyph in enumerate(superscript_numbers)
)
COMMANDS.update(
    (f"_{{{digit}}}", glyph) for digit, glyph in enumerate(subscript_numbers)
)

# Latin superscripts: lowercase glyphs first, then uppercase, matching the
# order of string.ascii_letters.
COMMANDS.update(
    (f"^{{{letter}}}", glyph)
    for letter, glyph in zip(
        string.ascii_letters, superscript_lowercase + superscript_uppercase
    )
    if glyph != " "
)

# Latin subscripts: the lowercase glyphs are used for uppercase letters too,
# hence the string is repeated to cover all of string.ascii_letters.
COMMANDS.update(
    (f"_{{{letter}}}", glyph)
    for letter, glyph in zip(string.ascii_letters, subscript_lowercase * 2)
    if glyph != " "
)

# Greek: for each letter register the superscript first, then the subscript.
COMMANDS.update(
    (f"{prefix}{{{name}}}", glyph)
    for name, *glyphs in zip(
        greek_lowercase, superscript_lowercase_greek, subscript_lowercase_greek
    )
    for prefix, glyph in zip("^_", glyphs)
    if glyph != " "
)
# Symbols extracted from http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt, which is under Copyright 2011 by Günter Milde and licensed under the LaTeX Project Public License (LPPL)

from pathlib import Path
from urllib.request import urlopen


def match(comments):
    """Return the ASCII symbol named in a Unicode character description.

    The description is searched for one of the known symbol names ("PLUS",
    "MINUS", "EQUALS", "LEFT PARENTHESIS", "RIGHT PARENTHESIS"), in that
    order; the first name found decides the result.

    Args:
        comments: Description text, e.g. ``"SUPERSCRIPT PLUS SIGN"``.

    Returns:
        One of ``"+"``, ``"-"``, ``"="``, ``"("`` or ``")"``.

    Raises:
        AssertionError: If none of the known names occurs in ``comments``.
    """
    symbol_names = (
        ("PLUS", "+"),
        ("MINUS", "-"),
        ("EQUALS", "="),
        ("LEFT PARENTHESIS", "("),
        ("RIGHT PARENTHESIS", ")"),
    )
    for name, latex in symbol_names:
        if name in comments:
            return latex
    # Raised explicitly instead of `assert False`: an assert statement is
    # removed under `python -O`, which would make this function return None
    # and let the caller register a bogus "^{None}" entry.
    raise AssertionError(f"unmatched: {comments}")


# Download the symbol table and merge it into COMMANDS.  Every data line is
# split on "^" into exactly eight fields; lines that are empty or start with
# "#" are skipped.
with urlopen("http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt") as response:
    body = response.read().decode()
    for line in body.split("\n"):
        if line.startswith("#") or not line:
            continue
        _, ch, latex, latex2, clas, category, requirements, comments = line.split("^")
        # Drops the final character of the description field.
        # NOTE(review): presumably a trailing "\r" left over from splitting
        # on "\n" only -- confirm against the downloaded file.
        comments = comments[:-1]

        if latex:
            # When the character field holds more than one code point the
            # second one is stored.
            # NOTE(review): presumably the first is a placeholder base for a
            # combining mark -- confirm.
            COMMANDS[latex] = ch[1] if len(ch) > 1 else ch
            continue
        if latex2:
            COMMANDS[latex2] = ch
            continue

        # No LaTeX name at all: derive one for super-/subscript symbols from
        # the description, e.g. "SUPERSCRIPT PLUS SIGN" -> "^{+}".
        for keyword, prefix in (("SUPERSCRIPT", "^"), ("SUBSCRIPT", "_")):
            if comments.startswith(keyword):
                COMMANDS[f"{prefix}{{{match(comments)}}}"] = ch
                break
# enhancements
# These entries are added after the symbol table has been loaded; the two
# alias lines raise KeyError if \rightarrow or \hslash is not registered yet.

# \to is an alias that shares the glyph stored for \rightarrow.
COMMANDS[r"\to"] = COMMANDS[r"\rightarrow"]
# A superscripted \ast is rendered as a plain asterisk.
COMMANDS[r"^{\ast}"] = "*"
# \hbar reuses the glyph stored for \hslash.
COMMANDS[r"\hbar"] = COMMANDS[r"\hslash"]
# A bare "h" is mapped explicitly to the glyph ℎ.
# NOTE(review): presumably needed because the loaded table offers no
# math-italic "h" under the plain letter -- confirm.
COMMANDS["h"] = "ℎ"

Lark is awesome because it generates the parser from an EBNF grammar string and is fast and lightweight.

from lark import Lark
from lark import Tree
from lark.visitors import Transformer, Visitor, Discard, v_args, Interpreter
from copy import deepcopy

# LALR parser for the supported LaTeX subset.  Overview of the grammar:
# - start:     any sequence of items and "$...$" math spans.
# - item:      an atom, a run of whitespace, or a "{...}" group.
# - atom:      a CHARACTER or a COMMAND.
# - CHARACTER: any single character other than % # & { } ^ _, or one of the
#              ESCAPED sequences (\\ \# \% \& \{ \} \_).
# - COMMAND:   a backslash followed by a WORD and any trailing whitespace,
#              or a bare "_" / "^".
# Rules prefixed with "?" are inlined by Lark when they have a single child,
# so most atoms and items show up in the tree as plain tokens.
parser = Lark(r"""
start: (item | math)*

?atom: CHARACTER
    | COMMAND

?item: atom
    | WS+
    | group

CHARACTER: /[^%#&\{\}^_]/ | ESCAPED
ESCAPED: "\\\\" | "\\#" | "\\%" | "\\&"  | "\\{" | "\\}" | "\\_"
group: "{" item* "}"
math: "$" item* "$"
SUBSCRIPT: "_"
SUPERSCRIPT: "^"
COMMAND: (("\\" WORD WS*) | SUBSCRIPT | SUPERSCRIPT)

%import common.WS
%import common.WORD
""", parser="lalr")
# Commands that handle_cmd() silently drops when neither the command itself
# nor the combination "command{argument}" has an entry in COMMANDS.
IGNORE_AS_FALLBACK = {
    r"\Big",
    r"\Bigg",
    r"\big",
    r"\left",
    r"\mathbf",
    r"\mathrm",
    r"\right",
    r"\text",
}

# Commands that take an argument: transform() pushes them onto the command
# stack instead of emitting text for them right away.
HAS_ARG = IGNORE_AS_FALLBACK | {
    # sub- and superscript
    "_",
    "^",
    # accents placed above the argument
    r"\acute",
    r"\bar",
    r"\breve",
    r"\check",
    r"\ddddot",
    r"\dddot",
    r"\ddot",
    r"\dot",
    r"\grave",
    r"\hat",
    r"\mathring",
    r"\tilde",
    # vectors and arrows
    r"\LVec",
    r"\lvec",
    r"\overleftrightarrow",
    r"\underleftarrow",
    r"\underrightarrow",
    r"\vec",
    # lines and tildes above or below
    r"\overline",
    r"\underbar",
    r"\underline",
    r"\utilde",
    # miscellaneous
    r"\not",
    r"\sqrt",
}

# Escape sequences that transform() replaces by a literal character.  The
# grammar's ESCAPED terminal also matches \#, \%, \& and \_; those have no
# entry here and are therefore passed on unchanged.
ESCAPED = {
    r"\}": "}",
    r"\{": "{",
    "\\\\": "\\",
}


def handle_cmd(state, x):
    r"""Render one atom, applying the commands pending on the command stack.

    In math mode the innermost pending command and ``x`` are first looked up
    in COMMANDS as one unit (e.g. ``^{\alpha}``).  Only if that fails are they
    handled independently: ``x`` is converted on its own and the remaining
    commands are applied from the innermost to the outermost.  Outside math
    mode nothing is converted; ``x`` is merely wrapped back into the pending
    commands as ``cmd{x}``.

    Args:
        state: Mutable conversion state.  ``state["math"]`` tells whether the
            atom is inside ``$...$``, ``state["command"]`` is the stack of
            pending commands (innermost last, may be empty) and
            ``state["group"]`` tells whether the atom is inside a brace group.
        x: A single character or a command such as ``\alpha``.

    Returns:
        The text to emit for ``x``.

    Side effects:
        Pops the innermost pending command from ``state["command"]`` unless
        ``state["group"]`` is set.
        NOTE(review): because of that condition, a command applied to an
        unbraced atom inside a group stays on the stack after this call.
    """
    pending = state["command"]
    if state["math"]:
        # Work on a copy: the stack in `state` must survive for the
        # remaining atoms of a group.
        cmd_stack = pending.copy()
        cmd = cmd_stack[-1] if cmd_stack else ""
        latex = f"{cmd}{{{x}}}"
        if cmd and latex in COMMANDS:
            # Innermost command and atom convert as one unit.
            x = COMMANDS[latex]
            cmd_stack.pop()
        elif x.startswith("\\"):
            # x is itself a command (or an escaped character): convert it
            # even inside \text / \mathrm.  The test must be for ONE
            # backslash; the raw literal r"\\" is two backslashes and never
            # matches, which let e.g. \mathrm{\alpha} through as raw LaTeX.
            x = COMMANDS.get(x, x)
        elif cmd in (r"\text", r"\mathrm"):
            # Plain characters inside \text / \mathrm are kept verbatim.
            cmd_stack.pop()
        else:
            x = COMMANDS.get(x, x)
        for cmd in reversed(cmd_stack):
            if cmd in COMMANDS:
                # must be some unicode modifier, e.g. \dot, \vec
                assert cmd in HAS_ARG
                x += COMMANDS[cmd]
                continue
            latex = f"{cmd}{{{x}}}"
            if latex in COMMANDS:
                x = COMMANDS[latex]
            elif cmd not in IGNORE_AS_FALLBACK:
                # No Unicode form available: keep the LaTeX source.
                x = latex
    else:
        for cmd in reversed(pending):
            x = f"{cmd}{{{x}}}"
    if pending and not state["group"]:
        pending.pop()
    return x


def transform(ch, state=None):
    r"""Recursively convert a parse tree or a single token to Unicode text.

    Args:
        ch: A Lark ``Tree`` or a token of type CHARACTER, WS or COMMAND.
        state: Mutable conversion state shared by the whole recursion; a
            fresh one is created when omitted.  Keys: ``"math"`` (inside
            ``$...$``), ``"command"`` (stack of pending commands, innermost
            last) and ``"group"`` (inside a brace group).  While a group is
            being processed this function also maintains ``"group_base"``,
            the stack depth at the point where the current group was entered.

    Returns:
        The converted text.

    Raises:
        AssertionError: If ``ch`` is a token of an unknown type.
    """
    if state is None:
        state = {
            "math": False,
            "command": [],
            "group": False,
        }

    if isinstance(ch, Tree):
        is_math = ch.data == "math"
        is_group = ch.data == "group"
        if is_math:
            state["math"] = True
        if is_group:
            # Remember the enclosing group's bookkeeping so that it can be
            # restored on exit; resetting unconditionally would make an outer
            # group lose its commands as soon as a nested group closes.
            outer_group = state["group"]
            outer_base = state.get("group_base")
            state["group"] = True
            state["group_base"] = len(state["command"])
        parts = [transform(child, state) for child in ch.children]
        if is_math:
            state["math"] = False
        if is_group:
            # Drop the commands consumed by this group (those pending when it
            # was entered plus any left over from inside), but keep the ones
            # that belong to the enclosing group.  At top level this clears
            # the whole stack.
            del state["command"][outer_base or 0:]
            state["group"] = outer_group
            state["group_base"] = outer_base
        return "".join(parts)

    if ch.type == "WS":
        # Whitespace is insignificant in math mode and collapses to a single
        # space elsewhere.
        return "" if state["math"] else " "

    if ch.type == "CHARACTER":
        x = ESCAPED.get(ch.value, ch.value)
    elif ch.type == "COMMAND":
        # The COMMAND terminal includes trailing whitespace.
        x = ch.value.strip()
        if x in HAS_ARG:
            if x == r"\sqrt":
                # Emit the radical sign now and overline the argument.
                state["command"].append(r"\overline")
                return COMMANDS[r"\sqrt"]
            state["command"].append(x)
            return ""
    else:
        # never arrive here; raised explicitly because an assert statement
        # would be removed under `python -O`
        raise AssertionError(f"unknown token {ch}")

    text = handle_cmd(state, x)
    # handle_cmd() pops a consumed command only outside of groups.  Inside a
    # group, pop a command that was pushed after the group was entered, so
    # that e.g. the "^" in "{x^2 y}" does not leak onto "y".
    base = state.get("group_base")
    if state["group"] and base is not None and len(state["command"]) > base:
        state["command"].pop()
    return text


def parse(s):
    """Convert the LaTeX string ``s`` to Unicode text.

    The string is parsed with the module-level Lark ``parser`` and the
    resulting tree is rendered by ``transform``.
    """
    return transform(parser.parse(s))

Let’s see how well (or not) this works on a few examples.

s = r"foo?!-1+2. \}  \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$ "
print(s)
parse(s)
foo?!-1+2. \}  \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$ 
'foo?!-1+2. } \\ (𝛂+𝟏²ₓ𝑦) bar 𝛽¹² 𝑝̄ᶠ𝑜𝑜 t̄ '
parse(r"$D^{\ast\ast} \to hhee$")
'𝐷**→ℎℎ𝑒𝑒'
parse(r"$\mathbf{xyz + 1}$")
'𝐱𝐲𝐳+𝟏'
parse(r"$\sqrt {1Aas\alpha}$")
'√1̅𝐴̅𝑎̅𝑠̅𝛼̅'
parse(r"$\vec{x} b^2 \vec\alpha\overline\alpha K^0_S p_\text{T} \text T$")
'𝑥⃗𝑏²𝛼⃗𝛼̅𝐾⁰ₛ𝑝ₜT'
parse(r"$\sqrt{abcd}$")
'√𝑎̅𝑏̅𝑐̅𝑑̅'
parse(r"$p_T / \text{GeV}c^{-1}$")
'𝑝ₜ/GeV𝑐⁻¹'
parse(r"Search for resonant $ \mathrm{t}\overline{\mathrm{t}} $ production in proton-proton collisions at $ \sqrt{s}=13 $ TeV")
'Search for resonant tt̅ production in proton-proton collisions at √𝑠̅=13 TeV'