from __future__ import annotations
from collections.abc import Iterable, Iterator
import re
from typing import AnyStr, IO
from warnings import warn
def read_preceded(
    fp: IO[AnyStr],
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
    chunk_size: int = 512,
) -> Iterator[AnyStr]:
    """
    Lazily yield the segments of the file-like object ``fp``, where each
    segment is introduced (preceded) by the string or compiled regex ``sep``.
    Any non-empty data before the first separator is yielded as a segment of
    its own.  An empty file always yields nothing.

    The file is consumed ``chunk_size`` characters at a time.  If ``sep`` is
    a variable-length compiled regex and an occurrence of it straddles a
    chunk boundary, the results are undefined.

    .. deprecated:: 0.4.0
        Passing a regular expression as a separator is deprecated, and support
        will be removed in version 1.0.

    :param fp: a binary or text file-like object
    :param sep: a string or compiled regex marking the start of each segment
    :param bool retain: if true, each yielded segment begins with the
        separator that introduced it
    :param int chunk_size: number of bytes or characters to request from
        ``fp`` per read
    :return: a generator of the segments in ``fp``
    :rtype: generator of binary or text strings
    """
    pieces = read_separated(fp, sep, retain=retain, chunk_size=chunk_size)
    # `read_separated()` always produces at least one item.  The first item
    # is whatever came before the first separator; it is dropped when empty.
    leading = next(pieces)
    if leading:
        yield leading
    # With `retain`, the rest of the stream alternates separator, segment,
    # separator, segment, ..., so gluing consecutive pairs puts each
    # separator in front of its segment.
    yield from (_join_pairs(pieces) if retain else pieces)
def read_separated(
    fp: IO[AnyStr],
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
    chunk_size: int = 512,
) -> Iterator[AnyStr]:
    """
    Lazily yield the segments of the file-like object ``fp``, where
    consecutive segments are separated by the string or compiled regex
    ``sep``.  An empty file always yields exactly one item, the empty string.

    The file is consumed ``chunk_size`` characters at a time.  If ``sep`` is
    a variable-length compiled regex and an occurrence of it straddles a
    chunk boundary, the results are undefined.

    .. deprecated:: 0.4.0
        Passing a regular expression as a separator is deprecated, and support
        will be removed in version 1.0.

    :param fp: a binary or text file-like object
    :param sep: a string or compiled regex marking the end of one segment and
        the start of the next
    :param bool retain: if true, the separators are yielded too, so the
        output alternates between segments and separators, beginning with a
        (possibly empty) segment
    :param int chunk_size: number of bytes or characters to request from
        ``fp`` per read
    :return: a generator of the segments in ``fp``
    :rtype: generator of binary or text strings
    """
    if not isinstance(sep, (bytes, str)):
        warn(
            "Passing a regular expression separator to a read_*() function is"
            " deprecated and will be removed in v1.0",
            DeprecationWarning,
        )
    pattern = _ensure_compiled(sep)
    # A zero-length read gives the empty value of the stream's own type
    # (b"" or ""), used both as the initial buffer and as the EOF marker.
    empty = fp.read(0)
    pending = empty
    while True:
        chunk = fp.read(chunk_size)
        if chunk == empty:
            break
        pending += chunk
        consumed = 0
        for match in pattern.finditer(pending):
            yield pending[consumed : match.start()]
            if retain:
                yield match.group()
            consumed = match.end()
        # Keep the unterminated tail so that it can be completed by (or a
        # separator can finish inside) the next chunk.
        pending = pending[consumed:]
    # Whatever follows the last separator is the final segment.
    yield pending
def read_terminated(
    fp: IO[AnyStr],
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
    chunk_size: int = 512,
) -> Iterator[AnyStr]:
    """
    Lazily yield the segments of the file-like object ``fp``, where each
    segment is ended (terminated) by the string or compiled regex ``sep``.
    Any non-empty data after the last separator is yielded as a final
    segment.  An empty file always yields nothing.

    The file is consumed ``chunk_size`` characters at a time.  If ``sep`` is
    a variable-length compiled regex and an occurrence of it straddles a
    chunk boundary, the results are undefined.

    .. deprecated:: 0.4.0
        Passing a regular expression as a separator is deprecated, and support
        will be removed in version 1.0.

    :param fp: a binary or text file-like object
    :param sep: a string or compiled regex marking the end of each segment
    :param bool retain: if true, each yielded segment ends with the separator
        that terminated it
    :param int chunk_size: number of bytes or characters to request from
        ``fp`` per read
    :return: a generator of the segments in ``fp``
    :rtype: generator of binary or text strings
    """
    segments = read_separated(fp, sep, retain=retain, chunk_size=chunk_size)
    if retain:
        # The stream alternates segment, separator, ...; gluing consecutive
        # pairs attaches each separator to the end of its segment.
        segments = _join_pairs(segments)
    # Run one item behind the source so that the very last item can be
    # treated specially: it is emitted only when it is non-empty.
    held = next(segments, None)
    for upcoming in segments:
        yield held
        held = upcoming
    if held:
        yield held
def split_preceded(
    s: AnyStr,
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
) -> list[AnyStr]:
    """
    Break the string ``s`` into zero or more segments, each introduced
    (preceded) by the string or compiled regex ``sep``.  Any non-empty text
    before the first separator becomes a segment of its own.  An empty input
    string always gives an empty list.

    :param s: a binary or text string
    :param sep: a string or compiled regex marking the start of each segment
    :param bool retain: if true, each returned segment begins with the
        separator that introduced it
    :return: a list of the segments in ``s``
    :rtype: list of binary or text strings
    """
    pieces = split_separated(s, sep, retain)
    # `split_separated()` always returns at least one element: the text
    # before the first separator.
    head, rest = pieces[0], pieces[1:]
    if retain:
        # `rest` alternates separator, segment, ...; merge each pair so the
        # separator prefixes its segment.
        rest = list(_join_pairs(rest))
    return [head, *rest] if head else rest
def split_separated(
    s: AnyStr,
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
) -> list[AnyStr]:
    """
    Break the string ``s`` into one or more segments that are separated by
    the string or compiled regex ``sep``.  An empty input string always gives
    a list holding a single empty string.

    :param s: a binary or text string
    :param sep: a string or compiled regex marking the end of one segment and
        the start of the next
    :param bool retain: if true, the separators are included in the result,
        which then alternates between segments and separators, beginning
        with a (possibly empty) segment
    :return: a list of the segments in ``s``
    :rtype: list of binary or text strings
    """
    pattern = _ensure_compiled(sep)
    pieces: list[AnyStr] = []
    cursor = 0  # index just past the most recently seen separator
    for match in pattern.finditer(s):
        start, end = match.span()
        pieces.append(s[cursor:start])
        if retain:
            pieces.append(match.group())
        cursor = end
    # Everything after the last separator (possibly empty) is a segment too.
    pieces.append(s[cursor:])
    return pieces
def split_terminated(
    s: AnyStr,
    sep: AnyStr | re.Pattern[AnyStr],
    retain: bool = False,
) -> list[AnyStr]:
    """
    Break the string ``s`` into zero or more segments, each ended
    (terminated) by the string or compiled regex ``sep``.  Any non-empty text
    after the last separator becomes a final segment.  An empty input string
    always gives an empty list.

    :param s: a binary or text string
    :param sep: a string or compiled regex marking the end of each segment
    :param bool retain: if true, each returned segment ends with the
        separator that terminated it
    :return: a list of the segments in ``s``
    :rtype: list of binary or text strings
    """
    pieces = split_separated(s, sep, retain)
    if retain:
        # `pieces` alternates segment, separator, ...; merge each pair so the
        # separator is attached to the end of its segment.
        pieces = list(_join_pairs(pieces))
    # The list is never empty here; its last element is the text after the
    # final separator, which is dropped when there is none.
    return pieces if pieces[-1] else pieces[:-1]
def join_preceded(iterable: Iterable[AnyStr], sep: AnyStr) -> AnyStr:
    """
    Concatenate the elements of ``iterable``, putting ``sep`` in front of
    every element

    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :rtype: a binary or text string
    """
    # An empty slice of `sep` is the empty string of the matching type
    # (b"" or ""), which serves as the glue for the final join.
    glue = sep[:0]
    prefixed = (sep + item for item in iterable)
    return glue.join(prefixed)
def join_separated(iterable: Iterable[AnyStr], sep: AnyStr) -> AnyStr:
    """
    Concatenate the elements of ``iterable``, placing ``sep`` between each
    pair of neighbouring elements

    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :rtype: a binary or text string
    """
    joined = sep.join(iterable)
    return joined
def join_terminated(iterable: Iterable[AnyStr], sep: AnyStr) -> AnyStr:
    """
    Concatenate the elements of ``iterable``, putting ``sep`` after every
    element

    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :rtype: a binary or text string
    """
    # An empty slice of `sep` is the empty string of the matching type
    # (b"" or ""), which serves as the glue for the final join.
    glue = sep[:0]
    suffixed = (item + sep for item in iterable)
    return glue.join(suffixed)
def write_preceded(fp: IO[AnyStr], iterable: Iterable[AnyStr], sep: AnyStr) -> None:
    """
    Write every element of ``iterable`` to the filehandle ``fp``, emitting
    ``sep`` immediately before each one

    :param fp: a binary or text file-like object
    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :return: `None`
    """
    emit = fp.write
    for item in iterable:
        emit(sep)
        emit(item)
def write_separated(fp: IO[AnyStr], iterable: Iterable[AnyStr], sep: AnyStr) -> None:
    """
    Write every element of ``iterable`` to the filehandle ``fp``, emitting
    ``sep`` between each pair of neighbouring elements

    :param fp: a binary or text file-like object
    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :return: `None`
    """
    items = iter(iterable)
    try:
        first = next(items)
    except StopIteration:
        # Nothing to write at all.
        return
    # The first element is written bare; every later one is led by `sep`.
    fp.write(first)
    for item in items:
        fp.write(sep)
        fp.write(item)
def write_terminated(fp: IO[AnyStr], iterable: Iterable[AnyStr], sep: AnyStr) -> None:
    """
    Write every element of ``iterable`` to the filehandle ``fp``, emitting
    ``sep`` immediately after each one

    :param fp: a binary or text file-like object
    :param iterable: an iterable of binary or text strings
    :param sep: a binary or text string
    :return: `None`
    """
    emit = fp.write
    for item in iterable:
        emit(item)
        emit(sep)
def _join_pairs(iterable: Iterable[AnyStr]) -> Iterator[AnyStr]:
"""
Concatenate each pair of consecutive strings in ``iterable``. If there are
an odd number of items in ``iterable``, the last one will be returned
unmodified.
"""
i = iter(iterable)
for a in i:
try:
b = next(i)
except StopIteration:
yield a
return
else:
yield a + b
def _ensure_compiled(sep: AnyStr | re.Pattern[AnyStr]) -> re.Pattern[AnyStr]:
if isinstance(sep, (bytes, str)):
return re.compile(re.escape(sep))
else:
return sep
# Matches one line ending: CR LF, a lone CR, or a lone LF.
_EOL_RGX = re.compile(r"\r\n?|\n")


def ascii_splitlines(s: str, keepends: bool = False) -> list[str]:
    """
    .. versionadded:: 0.3.0

    A variant of `str.splitlines()` that recognizes only ``"\\n"``,
    ``"\\r\\n"``, and ``"\\r"`` as line endings.

    :param s: the text to split
    :param bool keepends: if true, each returned line keeps its line ending
    :return: the lines of ``s``; text after the final line ending is
        included as a last line only when it is non-empty
    """
    pieces: list[str] = []
    cursor = 0  # index just past the most recently seen line ending
    for match in _EOL_RGX.finditer(s):
        cut = match.end() if keepends else match.start()
        pieces.append(s[cursor:cut])
        cursor = match.end()
    remainder = s[cursor:]
    if remainder:
        pieces.append(remainder)
    return pieces
def read_paragraphs(fp: Iterable[str]) -> Iterator[str]:
    """
    .. versionadded:: 0.3.0

    Lazily yield the paragraphs of a text filehandle or any other iterable of
    lines (each line keeping its trailing line ending).  A paragraph ends
    with a run of one or more blank lines, i.e., lines that consist solely of
    a line ending.  All line endings, both inside a paragraph and in the
    blank lines that close it, are kept in the yielded text.

    Only ``"\\n"``, ``"\\r\\n"``, and ``"\\r"`` are recognized as line endings.

    :param fp: an iterable of lines with their line endings retained
    :return: a generator of the paragraphs in ``fp``
    """
    current: list[str] = []
    for line in fp:
        # A new paragraph begins at the first non-blank line that follows a
        # blank one.
        after_blank = bool(current) and _is_blank(current[-1])
        if after_blank and not _is_blank(line):
            yield "".join(current)
            current = []
        current.append(line)
    if current:
        yield "".join(current)
def _is_blank(line: str) -> bool:
return line in ("\n", "\r", "\r\n")
# Regex source for exactly one line ending.  The negative lookahead stops a
# CR that is the first half of a CR LF pair from being matched on its own.
_EOL_RGX2 = r"(?:\r\n|\r(?!\n)|\n)"

# A paragraph terminator: either a run of line endings at the very start of
# the string, or two or more consecutive line endings anywhere (the one that
# ends the paragraph's last line plus at least one blank line).
_EOPARA_RGX = re.compile(rf"\A{_EOL_RGX2}+|{_EOL_RGX2}{{2,}}")


def split_paragraphs(s: str) -> list[str]:
    """
    .. versionadded:: 0.3.0

    Break a string into paragraphs.  A paragraph ends with a run of one or
    more blank lines, i.e., lines that consist solely of a line ending.  All
    line endings, both inside a paragraph and in the blank lines that close
    it, are kept in the returned text.

    Only ``"\\n"``, ``"\\r\\n"``, and ``"\\r"`` are recognized as line endings.

    :param s: the text to split
    :return: a list of the paragraphs in ``s``
    """
    paragraphs = split_terminated(s, _EOPARA_RGX, retain=True)
    return paragraphs