# Wikipedia: https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
import string
# Lookup table from a LaTeX snippet (e.g. "^{2}" or "_{\beta}") to the Unicode
# text that replaces it.
COMMANDS = {}

# Digits: the glyph at index i is the superscript/subscript form of digit i.
superscript_numbers = "⁰¹²³⁴⁵⁶⁷⁸⁹"
for i, ch in enumerate(superscript_numbers):
    COMMANDS[f"^{{{i}}}"] = ch

subscript_numbers = "₀₁₂₃₄₅₆₇₈₉"
for i, ch in enumerate(subscript_numbers):
    COMMANDS[f"_{{{i}}}"] = ch

# Letters. Only the lowercase superscript alphabet is complete, so only that
# one is stored positionally. Every table with gaps lists its letters
# explicitly: a space-padded positional string silently misaligns as soon as
# a run of spaces is collapsed (e.g. by copy/paste through HTML).
superscript_lowercase = "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖ𐞥ʳˢᵗᵘᵛʷˣʸᶻ"
for latex, ch in zip(string.ascii_lowercase, superscript_lowercase):
    COMMANDS[f"^{{{latex}}}"] = ch

# No superscript glyphs are listed for S, X, Y and Z.
superscript_uppercase = dict(
    zip("ABCDEFGHIJKLMNOPQRTUVW", "ᴬᴮꟲᴰᴱꟳᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾꟴᴿᵀᵁⱽᵂ")
)
for latex, ch in superscript_uppercase.items():
    COMMANDS[f"^{{{latex}}}"] = ch

# Subscript glyphs exist for these lowercase letters only.
subscript_lowercase = dict(zip("aehijklmnoprstuvx", "ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ"))
for latex, ch in subscript_lowercase.items():
    COMMANDS[f"_{{{latex}}}"] = ch
# Uppercase letters reuse the lowercase subscript glyphs (e.g. "_{T}" -> "ₜ").
for latex, ch in subscript_lowercase.items():
    COMMANDS[f"_{{{latex.upper()}}}"] = ch

# Lowercase Greek alphabet in order; omicron is the plain letter "o".
greek_lowercase = (
    r"\alpha",
    r"\beta",
    r"\gamma",
    r"\delta",
    r"\epsilon",
    r"\zeta",
    r"\eta",
    r"\theta",
    r"\iota",
    r"\kappa",
    r"\lambda",
    r"\mu",
    r"\nu",
    r"\xi",
    "o",
    r"\pi",
    r"\rho",
    r"\sigma",
    r"\tau",
    r"\upsilon",
    r"\phi",
    r"\chi",
    r"\psi",
    r"\omega",
)
superscript_lowercase_greek = {
    r"\beta": "ᵝ",
    r"\gamma": "ᵞ",
    r"\delta": "ᵟ",
    r"\epsilon": "ᵋ",
    r"\theta": "ᶿ",
    r"\iota": "ᶥ",
    r"\upsilon": "ᶹ",
    r"\phi": "ᵠ",
    r"\chi": "ᵡ",
}
subscript_lowercase_greek = {
    r"\beta": "ᵦ",
    r"\gamma": "ᵧ",
    r"\rho": "ᵨ",
    r"\phi": "ᵩ",
    r"\chi": "ᵪ",
}
# Walk the alphabet in order so entries are inserted letter by letter.
for latex in greek_lowercase:
    if latex in superscript_lowercase_greek:
        COMMANDS[f"^{{{latex}}}"] = superscript_lowercase_greek[latex]
    if latex in subscript_lowercase_greek:
        COMMANDS[f"_{{{latex}}}"] = subscript_lowercase_greek[latex]
It is possible to transform a subset of LaTeX to Unicode, as demonstrated by the unicodeit website. Unfortunately, unicodeit only works on short LaTeX strings.
Here, I write a simple parser with Lark to process more complex LaTeX strings. A more complete version of this prototype is available as unicodeitplus.
# Symbols extracted from http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt, which is under Copyright 2011 by Günter Milde and licensed under the LaTeX Project Public License (LPPL)
from pathlib import Path
from urllib.request import urlopen
def match(comments):
    """Return the ASCII symbol named in a Unicode character description.

    ``comments`` is searched for the first of the known names (PLUS, MINUS,
    EQUALS, LEFT PARENTHESIS, RIGHT PARENTHESIS), in that order, and the
    corresponding character is returned.

    Raises:
        AssertionError: if none of the known names occurs in ``comments``.
    """
    matches = [
        ("PLUS", "+"),
        ("MINUS", "-"),
        ("EQUALS", "="),
        ("LEFT PARENTHESIS", "("),
        ("RIGHT PARENTHESIS", ")"),
    ]
    for name, latex in matches:
        if name in comments:
            return latex
    # Raised explicitly (rather than via ``assert``) so that the check is not
    # stripped under ``python -O``, which would make this return None.
    raise AssertionError(f"unmatched: {comments}")
# Download the symbol table. The timeout keeps a stalled connection from
# blocking forever.
with urlopen(
    "http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt",
    timeout=30,
) as response:
    body = response.read().decode()

# Each data line holds eight "^"-separated fields; lines starting with "#"
# are comments.
for line in body.split("\n"):
    if not line or line.startswith("#"):
        continue
    items = line.split("^")
    _, ch, latex, latex2, clas, category, requirements, comments = items
    # Drop the last character of the description field.
    # NOTE(review): presumably a trailing line-ending character - confirm
    # against the downloaded file.
    comments = comments[:-1]
    if latex:
        # When the character field holds more than one code point, the second
        # one is stored.
        # NOTE(review): presumably the first is a placeholder base for a
        # combining mark - confirm against the data file.
        COMMANDS[latex] = ch[1] if len(ch) > 1 else ch
    elif latex2:
        COMMANDS[latex2] = ch
    elif comments.startswith("SUPERSCRIPT"):
        # No LaTeX command given: derive "^{...}" from the description.
        latex = f"^{{{match(comments)}}}"
        COMMANDS[latex] = ch
    elif comments.startswith("SUBSCRIPT"):
        # Same for subscripts: derive "_{...}" from the description.
        latex = f"_{{{match(comments)}}}"
        COMMANDS[latex] = ch
# enhancements: aliases and extra entries layered on top of the table
COMMANDS[r"\to"] = COMMANDS[r"\rightarrow"]  # \to renders like \rightarrow
COMMANDS[r"^{\ast}"] = "*"
COMMANDS[r"\hbar"] = COMMANDS[r"\hslash"]  # \hbar renders like \hslash
COMMANDS["h"] = "ℎ"
Lark is awesome, because it generates the parser from an EBNF string, and it is fast and lightweight.
from lark import Lark
from lark import Tree
from lark.visitors import Transformer, Visitor, Discard, v_args, Interpreter
from copy import deepcopy
# LALR parser for the supported LaTeX subset. The input is a sequence of items
# and "$...$" math sections; an item is a character, a command, whitespace or
# a "{...}" group. "_" and "^" are lexed as commands, like "\word".
parser = Lark(r"""
start: (item | math)*
?atom: CHARACTER
     | COMMAND
?item: atom
     | WS+
     | group
CHARACTER: /[^%#&\{\}^_]/ | ESCAPED
ESCAPED: "\\\\" | "\\#" | "\\%" | "\\&" | "\\{" | "\\}" | "\\_"
group: "{" item* "}"
math: "$" item* "$"
SUBSCRIPT: "_"
SUPERSCRIPT: "^"
COMMAND: (("\\" WORD WS*) | SUBSCRIPT | SUPERSCRIPT)
%import common.WS
%import common.WORD
""", parser="lalr")
# Commands that apply to the item (or group) that follows them. They are
# pushed on the command stack instead of being converted on their own.
HAS_ARG = {
    r"_",
    r"^",
    r"\grave",
    r"\acute",
    r"\hat",
    r"\tilde",
    r"\bar",
    r"\overline",
    r"\breve",
    r"\dot",
    r"\ddot",
    r"\mathring",
    r"\check",
    r"\utilde",
    r"\underbar",
    r"\underline",
    r"\not",
    r"\lvec",
    r"\vec",
    r"\LVec",
    r"\dddot",
    r"\ddddot",
    r"\overleftrightarrow",
    r"\underleftarrow",
    r"\underrightarrow",
    r"\mathbf",
    r"\text",
    r"\mathrm",
    r"\left",
    r"\right",
    r"\big",
    r"\Big",
    r"\Bigg",
    r"\sqrt",
}
# Commands that are dropped silently when no Unicode replacement exists for
# "command{argument}"; any other command is written back as LaTeX.
IGNORE_AS_FALLBACK = {
    r"\text",
    r"\mathbf",
    r"\mathrm",
    r"\left",
    r"\right",
    r"\big",
    r"\Big",
    r"\Bigg",
}
# Backslash-escaped characters and the literal character each one stands for.
# Covers every escape sequence matched as an escaped character by the
# grammar, so that none of them is emitted with its backslash.
ESCAPED = {
    r"\}": "}",
    r"\{": "{",
    "\\\\": "\\",
    r"\#": "#",
    r"\%": "%",
    r"\&": "&",
    r"\_": "_",
}
def handle_cmd(state, x):
    """Convert one character or command, applying the pending commands.

    Args:
        state: dict with the keys ``"math"`` (bool, inside ``$...$``),
            ``"command"`` (list used as a stack of pending commands, may be
            empty) and ``"group"`` (bool, inside ``{...}``).
        x: a single character or a command such as ``\\alpha``.

    Returns:
        The replacement text for ``x``.

    Side effects:
        Outside a group, the innermost pending command is popped from
        ``state["command"]`` because it has now been consumed.
    """
    # Work on a copy: the shared stack is only modified at the very end.
    cmd_stack = state["command"].copy()
    if state["math"]:
        # To transform ^{\alpha} or \text{x} correctly, first try to convert
        # the innermost command and x as a unit; they are treated
        # independently only if that fails.
        cmd = cmd_stack[-1] if cmd_stack else ""
        latex = f"{cmd}{{{x}}}"
        if cmd and latex in COMMANDS:
            x = COMMANDS[latex]
            cmd_stack.pop()
        elif x.startswith("\\"):
            # x is itself a command: convert it on its own. The test is for
            # one backslash; a test for two can never match a token here.
            x = COMMANDS.get(x, x)
        elif cmd in (r"\text", r"\mathrm"):
            # Upright text: keep the plain character and drop the command.
            cmd_stack.pop()
        else:
            x = COMMANDS.get(x, x)
        # Apply the remaining commands from the innermost one outwards.
        for outer in reversed(cmd_stack):
            if outer in COMMANDS:
                # must be some unicode modifier, e.g. \dot, \vec
                assert outer in HAS_ARG
                x += COMMANDS[outer]
            else:
                latex = f"{outer}{{{x}}}"
                if latex in COMMANDS:
                    x = COMMANDS[latex]
                elif outer not in IGNORE_AS_FALLBACK:
                    # No replacement known: write the command back as LaTeX.
                    x = latex
    else:
        # Outside math mode nothing is converted; the pending commands are
        # written back around x as LaTeX.
        for outer in reversed(state["command"]):
            x = f"{outer}{{{x}}}"
    if state["command"] and not state["group"]:
        state["command"].pop()
    return x
def transform(ch, state=None):
    """Render a parse tree node or token as Unicode text.

    Args:
        ch: a ``Tree`` (its children are rendered recursively and joined) or
            a token with ``type`` CHARACTER, WS or COMMAND.
        state: conversion state shared across the recursion; a fresh one is
            created when omitted. Keys: ``"math"`` (inside ``$...$``),
            ``"command"`` (stack of pending commands), ``"group"`` (inside
            ``{...}``).

    Returns:
        The converted text for ``ch``.

    Raises:
        AssertionError: if ``ch`` is a token of any other type.
    """
    if state is None:
        state = {
            "math": False,
            "command": [],
            "group": False,
        }

    if isinstance(ch, Tree):
        if ch.data == "math":
            state["math"] = True
        if ch.data == "group":
            state["group"] = True
        parts = [transform(child, state) for child in ch.children]
        if ch.data == "math":
            state["math"] = False
        if ch.data == "group":
            state["group"] = False
            # Commands are not popped while inside a group, so discard them
            # once the group is closed.
            state["command"].clear()
        return "".join(parts)

    if ch.type == "CHARACTER":
        x = ESCAPED.get(ch.value, ch.value)
        return handle_cmd(state, x)

    if ch.type == "WS":
        # Whitespace is dropped in math mode and becomes one space elsewhere.
        return "" if state["math"] else " "

    if ch.type == "COMMAND":
        # The COMMAND token may include trailing whitespace.
        x = ch.value.strip()
        if x in HAS_ARG:
            if x == r"\sqrt":
                # Emit the root sign now and overline what follows.
                state["command"].append(r"\overline")
                return COMMANDS[r"\sqrt"]
            # Defer: the command is applied to the item(s) that follow.
            state["command"].append(x)
            return ""
        return handle_cmd(state, x)

    # Never arrive here. Raised explicitly so the check survives ``python -O``.
    raise AssertionError(f"unknown token {ch}")
def parse(s):
    """Convert the LaTeX string ``s`` to Unicode text.

    The string is parsed with the module-level ``parser`` and the resulting
    tree is rendered with ``transform``.
    """
    tree = parser.parse(s)
    return transform(tree)
Let’s see how well (or not) this works on a few examples.
# Mixed text and math: escapes, delimiters, bold, scripts and accents.
s = r"foo?!-1+2. \} \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$ "
print(s)
parse(s)
foo?!-1+2. \} \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$
'foo?!-1+2. } \\ (𝛂+𝟏²ₓ𝑦) bar 𝛽¹² 𝑝̄ᶠ𝑜𝑜 t̄ '
parse(r"$D^{\ast\ast} \to hhee$")
'𝐷**→ℎℎ𝑒𝑒'
parse(r"$\mathbf{xyz + 1}$")
'𝐱𝐲𝐳+𝟏'
parse(r"$\sqrt {1Aas\alpha}$")
'√1̅𝐴̅𝑎̅𝑠̅𝛼̅'
parse(r"$\vec{x} b^2 \vec\alpha\overline\alpha K^0_S p_\text{T} \text T$")
'𝑥⃗𝑏²𝛼⃗𝛼̅𝐾⁰ₛ𝑝ₜT'
parse(r"$\sqrt{abcd}$")
'√𝑎̅𝑏̅𝑐̅𝑑̅'
parse(r"$p_T / \text{GeV}c^{-1}$")
'𝑝ₜ/GeV𝑐⁻¹'
parse(r"Search for resonant $ \mathrm{t}\overline{\mathrm{t}} $ production in proton-proton collisions at $ \sqrt{s}=13 $ TeV")
'Search for resonant tt̅ production in proton-proton collisions at √𝑠̅=13 TeV'