# Wikipedia: https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
import string
# Table mapping LaTeX snippets (e.g. "^{2}", r"_{\beta}") to Unicode text.
COMMANDS = {}

# Superscript and subscript digits: the n-th glyph belongs to the digit n.
superscript_numbers = "⁰¹²³⁴⁵⁶⁷⁸⁹"
subscript_numbers = "₀₁₂₃₄₅₆₇₈₉"
for digit, sup, sub in zip(string.digits, superscript_numbers, subscript_numbers):
    COMMANDS[f"^{{{digit}}}"] = sup
    COMMANDS[f"_{{{digit}}}"] = sub

# The letter tables are keyed explicitly. A positional string that uses spaces
# as "no glyph available" placeholders silently shifts every later entry as
# soon as the whitespace is altered, so only existing glyphs are listed here.

# Every lowercase Latin letter has a superscript form, so zipping against the
# full alphabet is safe (26 glyphs, no gaps).
superscript_lowercase = dict(
    zip(string.ascii_lowercase, "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖ𐞥ʳˢᵗᵘᵛʷˣʸᶻ")
)
# No superscript glyphs are listed for S, X, Y and Z.
superscript_uppercase = {
    "A": "ᴬ", "B": "ᴮ", "C": "ꟲ", "D": "ᴰ", "E": "ᴱ", "F": "ꟳ",
    "G": "ᴳ", "H": "ᴴ", "I": "ᴵ", "J": "ᴶ", "K": "ᴷ", "L": "ᴸ",
    "M": "ᴹ", "N": "ᴺ", "O": "ᴼ", "P": "ᴾ", "Q": "ꟴ", "R": "ᴿ",
    "T": "ᵀ", "U": "ᵁ", "V": "ⱽ", "W": "ᵂ",
}
for letters in (superscript_lowercase, superscript_uppercase):
    for latex, ch in letters.items():
        COMMANDS[f"^{{{latex}}}"] = ch

subscript_lowercase = {
    "a": "ₐ", "e": "ₑ", "h": "ₕ", "i": "ᵢ", "j": "ⱼ", "k": "ₖ",
    "l": "ₗ", "m": "ₘ", "n": "ₙ", "o": "ₒ", "p": "ₚ", "r": "ᵣ",
    "s": "ₛ", "t": "ₜ", "u": "ᵤ", "v": "ᵥ", "x": "ₓ",
}
# The lowercase subscript glyph is reused for the uppercase letter as well
# (the original zipped the table twice against ascii_letters).
for latex, ch in subscript_lowercase.items():
    COMMANDS[f"_{{{latex}}}"] = ch
    COMMANDS[f"_{{{latex.upper()}}}"] = ch

# Greek letters are keyed by their LaTeX command.
superscript_lowercase_greek = {
    r"\beta": "ᵝ", r"\gamma": "ᵞ", r"\delta": "ᵟ", r"\epsilon": "ᵋ",
    r"\theta": "ᶿ", r"\iota": "ᶥ", r"\upsilon": "ᶹ", r"\phi": "ᵠ",
    r"\chi": "ᵡ",
}
subscript_lowercase_greek = {
    r"\beta": "ᵦ", r"\gamma": "ᵧ", r"\rho": "ᵨ", r"\phi": "ᵩ", r"\chi": "ᵪ",
}
for latex, sup in superscript_lowercase_greek.items():
    COMMANDS[f"^{{{latex}}}"] = sup
for latex, sub in subscript_lowercase_greek.items():
    COMMANDS[f"_{{{latex}}}"] = sub

# It is possible to transform a subset of LaTeX to Unicode, as demonstrated by
# the unicodeit website. Unfortunately, unicodeit only works on short LaTeX
# strings.
Here, I write a simple parser with Lark to process more complex LaTeX strings. A more complete version of this prototype is available as unicodeitplus.
# Symbols extracted from http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt, which is under Copyright 2011 by Günter Milde and licensed under the LaTeX Project Public License (LPPL)
from pathlib import Path
from urllib.request import urlopen
def match(comments):
    """Return the ASCII symbol named in a character description.

    ``comments`` is a description string such as ``"SUPERSCRIPT PLUS SIGN"``.
    The first keyword found in it (checked in the order PLUS, MINUS, EQUALS,
    LEFT PARENTHESIS, RIGHT PARENTHESIS) decides the returned symbol.

    Raises:
        AssertionError: if none of the known keywords occurs in ``comments``.
    """
    keywords = (
        ("PLUS", "+"),
        ("MINUS", "-"),
        ("EQUALS", "="),
        ("LEFT PARENTHESIS", "("),
        ("RIGHT PARENTHESIS", ")"),
    )
    for keyword, latex in keywords:
        if keyword in comments:
            return latex
    # Raised explicitly rather than via `assert False`: an assert statement is
    # stripped under `python -O`, which would make this silently return None.
    raise AssertionError(f"unmatched: {comments}")
# Download the symbol table and register every usable entry in COMMANDS.
with urlopen(
    "http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt"
) as response:
    body = response.read().decode()

for line in body.split("\n"):
    # Blank lines and '#' comment lines carry no symbol.
    if not line or line.startswith("#"):
        continue
    # Each record must consist of exactly eight '^'-separated fields.
    items = line.split("^")
    _, ch, latex, latex2, clas, category, requirements, comments = items
    # Drops the last character of the comments field
    # (presumably a trailing line terminator - TODO confirm).
    comments = comments[:-1]
    if latex:
        # If the character field holds more than one code point, the second
        # one is used (presumably a placeholder precedes a combining mark -
        # TODO confirm against the data file).
        COMMANDS[latex] = ch[1] if len(ch) > 1 else ch
    elif latex2:
        COMMANDS[latex2] = ch
    else:
        # Entries without a LaTeX name are only used when they describe a
        # superscript or subscript symbol; everything else is skipped.
        for prefix, marker in (("SUPERSCRIPT", "^"), ("SUBSCRIPT", "_")):
            if comments.startswith(prefix):
                latex = f"{marker}{{{match(comments)}}}"
                COMMANDS[latex] = ch
                break

# enhancements
COMMANDS[r"\to"] = COMMANDS[r"\rightarrow"]  # \to is an alias of \rightarrow
COMMANDS[r"^{\ast}"] = "*"
COMMANDS[r"\hbar"] = COMMANDS[r"\hslash"]  # \hbar reuses the \hslash glyph
COMMANDS["h"] = "ℎ"

# Lark is awesome, because it generates the parser from an EBNF string, and is
# fast and lightweight.
from lark import Lark
from lark import Tree
from lark.visitors import Transformer, Visitor, Discard, v_args, Interpreter
from copy import deepcopy
# LALR parser for the supported LaTeX subset. The input is a sequence of items
# and $...$ math sections; an item is a character, a command, whitespace or a
# {...} group. "_" and "^" are lexed as commands, like backslash-words.
parser = Lark(
    r"""
start: (item | math)*
?atom: CHARACTER
| COMMAND
?item: atom
| WS+
| group
CHARACTER: /[^%#&\{\}^_]/ | ESCAPED
ESCAPED: "\\\\" | "\\#" | "\\%" | "\\&" | "\\{" | "\\}" | "\\_"
group: "{" item* "}"
math: "$" item* "$"
SUBSCRIPT: "_"
SUPERSCRIPT: "^"
COMMAND: (("\\" WORD WS*) | SUBSCRIPT | SUPERSCRIPT)
%import common.WS
%import common.WORD
""",
    parser="lalr",
)

# Commands that take an argument: they are pushed onto the command stack
# and applied to what follows instead of being converted on their own.
HAS_ARG = {
    # subscript / superscript markers
    r"_", r"^",
    # accents and similar decorations
    r"\grave", r"\acute", r"\hat", r"\tilde", r"\bar", r"\overline",
    r"\breve", r"\dot", r"\ddot", r"\dddot", r"\ddddot", r"\mathring",
    r"\check", r"\utilde", r"\underbar", r"\underline", r"\not",
    # arrows above / below
    r"\lvec", r"\vec", r"\LVec", r"\overleftrightarrow",
    r"\underleftarrow", r"\underrightarrow",
    # font and text switches
    r"\mathbf", r"\text", r"\mathrm",
    # delimiter sizing
    r"\left", r"\right", r"\big", r"\Big", r"\Bigg",
    # handled specially in transform()
    r"\sqrt",
}
# Commands that are dropped without a trace when no Unicode replacement
# exists for their argument.
IGNORE_AS_FALLBACK = set(
    r"\text \mathbf \mathrm \left \right \big \Big \Bigg".split()
)

# Escaped characters that are replaced by the bare character:
# "\}" -> "}", "\{" -> "{" and "\\" -> "\".
ESCAPED = {"\\" + bare: bare for bare in "}{\\"}
def handle_cmd(state, x):
    """Convert a single character or command, applying the pending commands.

    Args:
        state: Parser state dict. ``state["command"]`` is the stack of pending
            commands (may be empty), ``state["math"]`` tells whether we are
            inside ``$...$`` and ``state["group"]`` whether we are inside a
            ``{...}`` group.
        x: A character or a command such as ``\\alpha``.

    Returns:
        The converted text. In math mode this is the Unicode replacement with
        the pending commands applied; outside math mode it is ``x`` wrapped
        again in the pending commands as LaTeX source.

    Side effects:
        Outside of a group, the innermost pending command is removed from
        ``state["command"]`` because it has now been consumed.
    """
    pending = state["command"]
    if state["math"]:
        # Work on a copy: the real stack is only popped at the very end.
        cmd_stack = pending.copy()
        innermost = cmd_stack[-1] if cmd_stack else ""
        # To transform ^{\alpha} or \mathbf{x} correctly, first try to convert
        # the innermost command and x as one unit.
        combined = f"{innermost}{{{x}}}"
        if innermost and combined in COMMANDS:
            x = COMMANDS[combined]
            cmd_stack.pop()
        elif x.startswith("\\"):
            # x is itself a command. This used to test for r"\\" (two
            # backslashes), which no token starts with, so commands inside
            # \text / \mathrm were never converted.
            x = COMMANDS.get(x, x)
        elif innermost in (r"\text", r"\mathrm"):
            # Plain characters stay upright inside \text and \mathrm.
            cmd_stack.pop()
        else:
            x = COMMANDS.get(x, x)
        # Apply the remaining commands from the innermost outwards.
        for cmd in reversed(cmd_stack):
            if cmd in COMMANDS:
                # must be some unicode modifier, e.g. \dot, \vec
                assert cmd in HAS_ARG
                x += COMMANDS[cmd]
            else:
                combined = f"{cmd}{{{x}}}"
                if combined in COMMANDS:
                    x = COMMANDS[combined]
                elif cmd not in IGNORE_AS_FALLBACK:
                    # No replacement known: keep the LaTeX source visible.
                    x = combined
    else:
        # Outside math mode nothing is converted; re-wrap x in its commands.
        for cmd in reversed(pending):
            x = f"{cmd}{{{x}}}"
    if pending and not state["group"]:
        pending.pop()
    return x
def transform(ch, state=None):
    """Recursively convert a parse tree or a single token to Unicode text.

    Args:
        ch: A lark ``Tree`` or one of its tokens (CHARACTER, WS or COMMAND).
        state: Mutable parser state shared across the recursion; a fresh one
            is created when omitted.

    Returns:
        The converted string for ``ch``.
    """
    if state is None:
        state = {"math": False, "command": [], "group": False}

    if isinstance(ch, Tree):
        kind = ch.data
        if kind == "math":
            state["math"] = True
        if kind == "group":
            state["group"] = True
        pieces = [transform(child, state) for child in ch.children]
        if kind == "math":
            state["math"] = False
        if kind == "group":
            # NOTE(review): the flag is reset to False rather than restored,
            # and the whole stack is emptied, so an inner group also ends the
            # enclosing one - confirm whether nested groups must be supported.
            state["group"] = False
            state["command"].clear()
        return "".join(pieces)

    token_type = ch.type
    if token_type == "CHARACTER":
        return handle_cmd(state, ESCAPED.get(ch.value, ch.value))
    if token_type == "WS":
        # Whitespace is dropped in math mode and collapsed to one blank
        # elsewhere.
        return "" if state["math"] else " "
    if token_type == "COMMAND":
        name = ch.value.strip()
        if name not in HAS_ARG:
            return handle_cmd(state, name)
        if name == r"\sqrt":
            # Emit the root sign right away and overline the argument.
            state["command"].append(r"\overline")
            return COMMANDS[r"\sqrt"]
        # Commands with an argument wait on the stack for what follows.
        state["command"].append(name)
        return ""
    # never arrive here
    assert False, f"unknown token {ch}"
def parse(s):
    """Parse the LaTeX string ``s`` and return its Unicode rendering."""
    return transform(parser.parse(s))


# Let's see how well (or not) this works on a few examples.
# Example calls. The text that was fused onto each call is kept as a comment;
# it appears to be the output recorded when the cell was run.
s = r"foo?!-1+2. \} \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$ "
print(s)
# printed: foo?!-1+2. \} \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$ $\bar \mathrm{t}$
parse(s)
# -> 'foo?!-1+2. } \\ (𝛂+𝟏²ₓ𝑦) bar 𝛽¹² 𝑝̄ᶠ𝑜𝑜 t̄ '
parse(r"$D^{\ast\ast} \to hhee$")
# -> '𝐷**→ℎℎ𝑒𝑒'
parse(r"$\mathbf{xyz + 1}$")
# -> '𝐱𝐲𝐳+𝟏'
parse(r"$\sqrt {1Aas\alpha}$")
# -> '√1̅𝐴̅𝑎̅𝑠̅𝛼̅'
parse(r"$\vec{x} b^2 \vec\alpha\overline\alpha K^0_S p_\text{T} \text T$")
# -> '𝑥⃗𝑏²𝛼⃗𝛼̅𝐾⁰ₛ𝑝ₜT'
parse(r"$\sqrt{abcd}$")
# -> '√𝑎̅𝑏̅𝑐̅𝑑̅'
parse(r"$p_T / \text{GeV}c^{-1}$")
# -> '𝑝ₜ/GeV𝑐⁻¹'
parse(r"Search for resonant $ \mathrm{t}\overline{\mathrm{t}} $ production in proton-proton collisions at $ \sqrt{s}=13 $ TeV")
# -> 'Search for resonant tt̅ production in proton-proton collisions at √𝑠̅=13 TeV'