"Character reader for parsing Hy source."
from itertools import islice
import hy
from hy.models import (
Bytes,
Complex,
Dict,
Expression,
FComponent,
Float,
FString,
Integer,
Keyword,
List,
Set,
String,
Symbol,
Tuple,
as_model,
)
from .exceptions import LexException, PrematureEndOfInput
from .mangling import mangle
from .reader import Reader, isnormalizedspace
def sym(name):
return Symbol(name, from_parser=True)
# Note: This is subtly different from
# the `mkexpr` in hy/compiler.py !
def mkexpr(root, *args):
return Expression((sym(root) if isinstance(root, str) else root, *args))
def as_identifier(ident, reader=None):
"""Generate a Hy model from an identifier.
Also verifies the syntax of dot notation and validity of symbol names.
Parameters
----------
ident : str
Text to convert.
reader : Reader, optional
The reader to use, if any; used for generating position data for errors.
Returns
-------
out : a hy.models.Object subtype corresponding to the parsed text.
"""
try:
return Integer(ident)
except ValueError:
pass
try:
return Float(ident)
except ValueError:
pass
if ident not in ("j", "J"):
try:
return Complex(ident)
except ValueError:
pass
if "." in ident:
if not ident.strip("."):
# It's all dots. Return it as a symbol.
return sym(ident)
def err(msg):
raise (
ValueError(msg)
if reader is None
else LexException.from_reader(msg, reader)
)
if ident.lstrip(".").find("..") > 0:
err(
"In a dotted identifier, multiple dots in a row are only allowed at the start"
)
if ident.endswith("."):
err("A dotted identifier can't end with a dot")
head = "." * (len(ident) - len(ident.lstrip(".")))
args = [as_identifier(a, reader=reader) for a in ident.lstrip(".").split(".")]
if any(not isinstance(a, Symbol) for a in args):
err("The parts of a dotted identifier must be symbols")
return (
mkexpr(sym("."), *args)
if head == ""
else mkexpr(head, Symbol("None"), *args)
)
if reader is None:
if (
not ident
or ident[0] in ":#"
or any(isnormalizedspace(c) for c in ident)
or HyReader.NON_IDENT.intersection(ident)
):
raise ValueError(f"Syntactically illegal symbol: {ident!r}")
return sym(ident)
[docs]class HyReader(Reader):
"""A modular reader for Hy source."""
###
# Components necessary for Reader implementation
###
NON_IDENT = set("()[]{};\"'`~")
def __init__(self):
super().__init__()
# move any reader macros declared using
# `reader_for("#...")` to the macro table
self.reader_macros = {}
for tag in list(self.reader_table.keys()):
if tag[0] == '#' and tag[1:]:
self.reader_macros[tag[1:]] = self.reader_table.pop(tag)
[docs] def fill_pos(self, model, start):
"""Attach line/col information to a model.
Sets the end location of `model` to the current cursor position.
Args:
model (hy.models.Object): model to set line/col info for.
start (tuple[int, int]): (line, column) tuple indicating the start
location to assign to `model`.
"""
model.start_line, model.start_column = start
model.end_line, model.end_column = self.pos
return model.replace(model)
# `replace` will recurse into submodels and set any model
# positions that are still unset the same way.
[docs] def read_default(self, key):
"""Default reader handler when nothing in the table matches.
Try to read an identifier. If there's a double-quote immediately
following, then instead parse it as a string with the given prefix (e.g.,
`r"..."`).
"""
ident = key + self.read_ident()
if self.peek_and_getc('"'):
return self.prefixed_string('"', ident)
return as_identifier(ident, reader=self)
[docs] def parse(self, stream, filename=None, skip_shebang=False):
"""Yields all `hy.models.Object`'s in `source`
Additionally exposes `self` as ``hy.&reader`` during read/compile time.
Args:
source:
Hy source to be parsed.
filename (str | None):
Filename to use for error messages. If `None` then previously
set filename is used.
skip_shebang:
Whether to detect a skip a shebang line at the start.
"""
self._set_source(stream, filename)
if skip_shebang and "".join(
islice(self.peeking(eof_ok = True), len("#!"))) == "#!":
for c in self.chars():
if c == "\n":
break
rname = mangle("&reader")
old_reader = getattr(hy, rname, None)
setattr(hy, rname, self)
try:
yield from self.parse_forms_until("")
finally:
if old_reader is None:
delattr(hy, rname)
else:
setattr(hy, rname, old_reader)
###
# Reading forms
###
def try_parse_one_form(self):
"""Attempt to parse a single Hy form.
Read one (non-space) character from the stream, then call the
corresponding handler.
Returns:
hy.models.Object | None:
Model optionally returned by the called handler. Handlers may
return `None` to signify no parsed form (e.g., for comments).
Raises:
PrematureEndOfInput: If the reader hits the end of the file before
fully parsing a form.
LexException: If there is an error during form parsing.
"""
try:
self.slurp_space()
c = self.getc()
start = self._pos
if not c:
raise PrematureEndOfInput.from_reader(
"Premature end of input while attempting to parse one form", self
)
handler = self.reader_table.get(c)
model = handler(self, c) if handler else self.read_default(c)
return self.fill_pos(model, start) if model is not None else None
except LexException:
raise
except Exception as e:
raise LexException.from_reader(
str(e) or "Exception thrown attempting to parse one form", self
)
###
# Basic atoms
###
@reader_for(")")
@reader_for("]")
@reader_for("}")
def INVALID(self, key):
raise LexException.from_reader(
f"Ran into a '{key}' where it wasn't expected.", self
)
@reader_for(";")
def line_comment(self, _):
any(c == "\n" for c in self.chars(eof_ok=True))
return None
@reader_for(":")
def keyword(self, _):
ident = self.read_ident()
if "." in ident:
raise LexException.from_reader(
"Cannot access attribute on anything other"
" than a name (in order to get attributes of expressions,"
" use `(. <expression> <attr>)` or `(.<attr> <expression>)`)",
self,
)
return Keyword(ident, from_parser=True)
@reader_for('"')
def prefixed_string(self, _, prefix=""):
prefix_chars = set(prefix)
if (
len(prefix_chars) != len(prefix)
or prefix_chars - set("bfr")
or set("bf") <= prefix_chars
):
raise LexException.from_reader(f"invalid string prefix {prefix!r}", self)
escaping = False
def quote_closing(c):
nonlocal escaping
if c == "\\":
escaping = not escaping
return 0
if c == '"' and not escaping:
return 1
if (
escaping
and "r" not in prefix
and
# https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
c
not in ("\n\r\\'\"abfnrtv01234567x" + ("" if "b" in prefix else "NuU"))
):
raise LexException.from_reader("invalid escape sequence \\" + c, self)
escaping = False
return 0
return self.read_string_until(quote_closing, prefix, "f" in prefix.lower())
###
# Special annotations
###
@reader_for("'", ("quote",))
@reader_for("`", ("quasiquote",))
def tag_as(root):
return lambda self, _: mkexpr(root, self.parse_one_form())
@reader_for("~")
def unquote(self, key):
return mkexpr(
"unquote" + ("-splice" if self.peek_and_getc("@") else ""),
self.parse_one_form(),
)
###
# Sequences
###
@reader_for("(", (Expression, ")"))
@reader_for("[", (List, "]"))
@reader_for("{", (Dict, "}"))
@reader_for("#{", (Set, "}"))
@reader_for("#(", (Tuple, ")"))
def sequence(seq_type, closer):
return lambda self, _: seq_type(self.parse_forms_until(closer))
###
# Reader tag-macros
###
@reader_for("#")
def tag_dispatch(self, key):
"""General handler for reader macros (and tag macros).
Reads a full identifier after the `#` and calls the corresponding handler
(this allows, e.g., `#reads-multiple-forms foo bar baz`).
"""
if not self.peekc().strip():
raise PrematureEndOfInput.from_reader(
"Premature end of input while attempting dispatch", self
)
# try dispatching tagged ident
ident = self.read_ident() or self.getc()
if ident in self.reader_macros:
tree = self.reader_macros[ident](self, ident)
return as_model(tree) if tree is not None else None
raise LexException.from_reader(
f"reader macro '{key + ident}' is not defined", self
)
@reader_for("#_")
def discard(self, _):
"""Discards the next parsed form."""
self.parse_one_form()
return None
@reader_for("#*")
@reader_for("#**")
def hash_star(self, stars):
"""Unpacking forms `#*` and `#**`, corresponding to `*` and `**` in Python."""
return mkexpr(
"unpack-" + {"*": "iterable", "**": "mapping"}[stars],
self.parse_one_form(),
)
@reader_for("#^")
def annotate(self, _):
"""Annotate a symbol, usually with a type."""
typ = self.parse_one_form()
target = self.parse_one_form()
return mkexpr("annotate", target, typ)
###
# Strings
# (these are more complicated because f-strings
# form their own sublanguage)
###
@reader_for("#[")
def bracketed_string(self, _):
"""Bracketed strings. See the Hy docs for full details."""
delim = []
for c in self.chars():
if c == "[":
break
elif c == "]":
raise LexException.from_reader(
"Ran into a ']' where it wasn't expected.", self
)
delim.append(c)
delim = "".join(delim)
is_fstring = delim == "f" or delim.startswith("f-")
# discard single initial newline, if any, accounting for all
# three styles of newline
self.peek_and_getc("\x0d")
self.peek_and_getc("\x0a")
index = -1
def delim_closing(c):
nonlocal index
if c == "]":
if index == len(delim):
# this is the second bracket at the end of the delim
return len(delim) + 2
else:
# reset state, this may be the first bracket of closing delim
index = 0
elif 0 <= index <= len(delim):
# we're inside a possible closing delim
if index < len(delim) and c == delim[index]:
index += 1
else:
# failed delim, reset state
index = -1
return 0
return self.read_string_until(delim_closing, "fr" if is_fstring else None, is_fstring, brackets=delim)
def read_string_until(self, closing, prefix, is_fstring, **kwargs):
if is_fstring:
components = self.read_fcomponents_until(closing, prefix)
return FString(components, **kwargs)
s = self.read_chars_until(closing, prefix, is_fstring=False)
return (Bytes if isinstance(s, bytes) else String)(s, **kwargs)
def read_chars_until(self, closing, prefix, is_fstring):
s = []
for c in self.chars():
s.append(c)
# check if c is closing
n_closing_chars = closing(c)
if n_closing_chars:
# string has ended
s = s[:-n_closing_chars]
break
# check if c is start of component
if is_fstring and c == "{" and s[-3:] != ["\\", "N", "{"]:
# check and handle "{{"
if self.peek_and_getc("{"):
s.append("{")
else:
# remove "{" from end of string component
s.pop()
break
res = "".join(s).replace("\x0d\x0a", "\x0a").replace("\x0d", "\x0a")
if prefix is not None:
res = eval(f'{prefix}"""{res}"""')
if is_fstring:
return res, n_closing_chars
return res
def read_fcomponents_until(self, closing, prefix):
components = []
start = self.pos
while True:
s, closed = self.read_chars_until(closing, prefix, is_fstring=True)
if s:
components.append(self.fill_pos(String(s), start))
if closed:
break
components.extend(self.read_fcomponent(prefix))
return components
def read_fcomponent(self, prefix):
"""May return one or two components, since the `=` debugging syntax
will create a String component."""
start = self.pos
values = []
conversion = None
has_debug = False
# read the expression, saving the text verbatim
# in case we encounter debug `=`
space_before = self.slurp_space()
with self.saving_chars() as form_text:
model = self.parse_one_form()
space_between = self.slurp_space()
# check for and handle debug syntax:
# we emt the verbatim text before we emit the value
if self.peek_and_getc("="):
has_debug = True
space_after = self.slurp_space()
dbg_prefix = (
space_before + "".join(form_text) + space_between + "=" + space_after
)
values.append(self.fill_pos(String(dbg_prefix), start))
# handle conversion code
if self.peek_and_getc("!"):
conversion = self.getc()
self.slurp_space()
def component_closing(c):
if c == "}":
return 1
return 0
# handle formatting options
format_components = []
if self.peek_and_getc(":"):
format_components = self.read_fcomponents_until(component_closing, prefix)
else:
if has_debug and conversion is None:
conversion = "r"
if not self.getc() == "}":
raise LexException.from_reader("f-string: trailing junk in field", self)
return values + [
self.fill_pos(FComponent((model, *format_components), conversion), start)
]