Merging PR_218 openai_rev package with new streamlit chat app

Author: noptuno
Date: 2023-04-27 20:29:30 -04:00
parent 479b8d6d10
commit 355dee533b
8378 changed files with 2,931,636 additions and 3 deletions

View File: markdown_it/rules_inline/__init__.py

@@ -0,0 +1,29 @@
__all__ = (
    "StateInline",
    "text",
    "text_collapse",
    "link_pairs",
    "escape",
    "newline",
    "backtick",
    "emphasis",
    "image",
    "link",
    "autolink",
    "entity",
    "html_inline",
    "strikethrough",
)
from . import emphasis, strikethrough
from .autolink import autolink
from .backticks import backtick
from .balance_pairs import link_pairs
from .entity import entity
from .escape import escape
from .html_inline import html_inline
from .image import image
from .link import link
from .newline import newline
from .state_inline import StateInline
from .text import text
from .text_collapse import text_collapse

View File: markdown_it/rules_inline/autolink.py

@@ -0,0 +1,77 @@
# Process autolinks '<protocol:...>'
import re

from .state_inline import StateInline

EMAIL_RE = re.compile(
    r"^([a-zA-Z0-9.!#$%&\'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$"  # noqa: E501
)
AUTOLINK_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.\-]{1,31}):([^<>\x00-\x20]*)$")


def autolink(state: StateInline, silent: bool) -> bool:
    pos = state.pos

    if state.srcCharCode[pos] != 0x3C:  # /* < */
        return False

    start = state.pos
    maximum = state.posMax

    while True:
        pos += 1
        if pos >= maximum:
            return False

        ch = state.srcCharCode[pos]

        if ch == 0x3C:  # /* < */
            return False
        if ch == 0x3E:  # /* > */
            break

    url = state.src[start + 1 : pos]

    if AUTOLINK_RE.search(url) is not None:
        fullUrl = state.md.normalizeLink(url)
        if not state.md.validateLink(fullUrl):
            return False

        if not silent:
            token = state.push("link_open", "a", 1)
            token.attrs = {"href": fullUrl}
            token.markup = "autolink"
            token.info = "auto"

            token = state.push("text", "", 0)
            token.content = state.md.normalizeLinkText(url)

            token = state.push("link_close", "a", -1)
            token.markup = "autolink"
            token.info = "auto"

        state.pos += len(url) + 2
        return True

    if EMAIL_RE.search(url) is not None:
        fullUrl = state.md.normalizeLink("mailto:" + url)
        if not state.md.validateLink(fullUrl):
            return False

        if not silent:
            token = state.push("link_open", "a", 1)
            token.attrs = {"href": fullUrl}
            token.markup = "autolink"
            token.info = "auto"

            token = state.push("text", "", 0)
            token.content = state.md.normalizeLinkText(url)

            token = state.push("link_close", "a", -1)
            token.markup = "autolink"
            token.info = "auto"

        state.pos += len(url) + 2
        return True

    return False
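
A minimal sketch of both branches through the public API (assuming the vendored markdown_it package is importable; user@example.com is a placeholder address):

from markdown_it import MarkdownIt

md = MarkdownIt()

# URL branch: AUTOLINK_RE matches the "<scheme:...>" form
print(md.renderInline("<https://commonmark.org>"))
# <a href="https://commonmark.org">https://commonmark.org</a>

# e-mail branch: EMAIL_RE matches, and "mailto:" is prepended to the href
print(md.renderInline("<user@example.com>"))
# <a href="mailto:user@example.com">user@example.com</a>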

View File: markdown_it/rules_inline/backticks.py

@@ -0,0 +1,74 @@
# Parse backticks
import re

from .state_inline import StateInline

regex = re.compile("^ (.+) $")


def backtick(state: StateInline, silent: bool) -> bool:
    pos = state.pos
    ch = state.srcCharCode[pos]

    # /* ` */
    if ch != 0x60:
        return False

    start = pos
    pos += 1
    maximum = state.posMax

    # scan marker length
    while pos < maximum and (state.srcCharCode[pos] == 0x60):  # /* ` */
        pos += 1

    marker = state.src[start:pos]
    openerLength = len(marker)

    if state.backticksScanned and state.backticks.get(openerLength, 0) <= start:
        if not silent:
            state.pending += marker
        state.pos += openerLength
        return True

    matchStart = matchEnd = pos

    # Nothing found in the cache, scan until the end of the line (or until marker is found)
    while True:
        try:
            matchStart = state.src.index("`", matchEnd)
        except ValueError:
            break
        matchEnd = matchStart + 1

        # scan marker length
        while matchEnd < maximum and (state.srcCharCode[matchEnd] == 0x60):  # /* ` */
            matchEnd += 1

        closerLength = matchEnd - matchStart

        if closerLength == openerLength:
            # Found matching closer length.
            if not silent:
                token = state.push("code_inline", "code", 0)
                token.markup = marker
                token.content = state.src[pos:matchStart].replace("\n", " ")
                if (
                    token.content.startswith(" ")
                    and token.content.endswith(" ")
                    and len(token.content.strip()) > 0
                ):
                    token.content = token.content[1:-1]
            state.pos = matchEnd
            return True

        # Some different length found, put it in cache as upper limit of where closer can be found
        state.backticks[closerLength] = matchStart

    # Scanned through the end, didn't find anything
    state.backticksScanned = True

    if not silent:
        state.pending += marker
    state.pos += openerLength
    return True
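
The opener/closer length matching above is what lets a longer fence wrap literal backticks; a short sketch of the observable behaviour (public MarkdownIt API assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

print(md.renderInline("`code`"))            # <code>code</code>
# a closer only counts when its run length equals the opener's:
print(md.renderInline("``has ` inside``"))  # <code>has ` inside</code>
# the startswith/endswith check strips one space from each side:
print(md.renderInline("`` `ticked` ``"))    # <code>`ticked`</code>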

View File: markdown_it/rules_inline/balance_pairs.py

@@ -0,0 +1,112 @@
# For each opening emphasis-like marker find a matching closing one
#
from .state_inline import StateInline


def processDelimiters(state: StateInline, delimiters, *args):
    openersBottom = {}
    maximum = len(delimiters)

    closerIdx = 0
    while closerIdx < maximum:
        closer = delimiters[closerIdx]

        # Length is only used for emphasis-specific "rule of 3",
        # if it's not defined (in strikethrough or 3rd party plugins),
        # we can default it to 0 to disable those checks.
        #
        closer.length = closer.length or 0

        if not closer.close:
            closerIdx += 1
            continue

        # Previously calculated lower bounds (previous fails)
        # for each marker, each delimiter length modulo 3,
        # and for whether this closer can be an opener;
        # https://github.com/commonmark/cmark/commit/34250e12ccebdc6372b8b49c44fab57c72443460
        if closer.marker not in openersBottom:
            openersBottom[closer.marker] = [-1, -1, -1, -1, -1, -1]

        minOpenerIdx = openersBottom[closer.marker][
            (3 if closer.open else 0) + (closer.length % 3)
        ]

        openerIdx = closerIdx - closer.jump - 1

        # avoid crash if `closer.jump` is pointing outside of the array,
        # e.g. for strikethrough
        if openerIdx < -1:
            openerIdx = -1

        newMinOpenerIdx = openerIdx

        while openerIdx > minOpenerIdx:
            opener = delimiters[openerIdx]

            if opener.marker != closer.marker:
                openerIdx -= opener.jump + 1
                continue

            if opener.open and opener.end < 0:
                isOddMatch = False

                # from spec:
                #
                # If one of the delimiters can both open and close emphasis, then the
                # sum of the lengths of the delimiter runs containing the opening and
                # closing delimiters must not be a multiple of 3 unless both lengths
                # are multiples of 3.
                #
                if opener.close or closer.open:
                    if (opener.length + closer.length) % 3 == 0:
                        if opener.length % 3 != 0 or closer.length % 3 != 0:
                            isOddMatch = True

                if not isOddMatch:
                    # If previous delimiter cannot be an opener, we can safely skip
                    # the entire sequence in future checks. This is required to make
                    # sure algorithm has linear complexity (see *_*_*_*_*_... case).
                    #
                    if openerIdx > 0 and not delimiters[openerIdx - 1].open:
                        lastJump = delimiters[openerIdx - 1].jump + 1
                    else:
                        lastJump = 0

                    closer.jump = closerIdx - openerIdx + lastJump
                    closer.open = False
                    opener.end = closerIdx
                    opener.jump = lastJump
                    opener.close = False
                    newMinOpenerIdx = -1
                    break

            openerIdx -= opener.jump + 1

        if newMinOpenerIdx != -1:
            # If match for this delimiter run failed, we want to set lower bound for
            # future lookups. This is required to make sure algorithm has linear
            # complexity.
            #
            # See details here:
            # https://github.com/commonmark/cmark/issues/178#issuecomment-270417442
            #
            openersBottom[closer.marker][
                (3 if closer.open else 0) + ((closer.length or 0) % 3)
            ] = newMinOpenerIdx

        closerIdx += 1


def link_pairs(state: StateInline) -> None:
    tokens_meta = state.tokens_meta
    maximum = len(state.tokens_meta)

    processDelimiters(state, state.delimiters)

    curr = 0
    while curr < maximum:
        curr_meta = tokens_meta[curr]
        if curr_meta and "delimiters" in curr_meta:
            processDelimiters(state, curr_meta["delimiters"])
        curr += 1
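
processDelimiters only pairs delimiters; the tag rewriting happens later in the emphasis/strikethrough postprocessing. A small sketch of the net effect (public API assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

# the pairing matches the inner and outer runs of a triple marker:
print(md.renderInline("***both***"))  # <em><strong>both</strong></em>
# a run with no matching partner is left as literal text:
print(md.renderInline("*unclosed"))   # *unclosed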

View File: markdown_it/rules_inline/emphasis.py

@@ -0,0 +1,101 @@
# Process *this* and _that_
#
from .state_inline import Delimiter, StateInline


def tokenize(state: StateInline, silent: bool):
    """Insert each marker as a separate text token, and add it to delimiter list"""
    start = state.pos
    marker = state.srcCharCode[start]

    if silent:
        return False

    # /* _ */  /* * */
    if marker != 0x5F and marker != 0x2A:
        return False

    scanned = state.scanDelims(state.pos, marker == 0x2A)

    for i in range(scanned.length):
        token = state.push("text", "", 0)
        token.content = chr(marker)
        state.delimiters.append(
            Delimiter(
                marker=marker,
                length=scanned.length,
                jump=i,
                token=len(state.tokens) - 1,
                end=-1,
                open=scanned.can_open,
                close=scanned.can_close,
            )
        )

    state.pos += scanned.length
    return True


def _postProcess(state, delimiters):
    i = len(delimiters) - 1
    while i >= 0:
        startDelim = delimiters[i]

        # /* _ */  /* * */
        if startDelim.marker != 0x5F and startDelim.marker != 0x2A:
            i -= 1
            continue

        # Process only opening markers
        if startDelim.end == -1:
            i -= 1
            continue

        endDelim = delimiters[startDelim.end]

        # If the previous delimiter has the same marker and is adjacent to this one,
        # merge those into one strong delimiter.
        #
        # `<em><em>whatever</em></em>` -> `<strong>whatever</strong>`
        #
        isStrong = (
            i > 0
            and delimiters[i - 1].end == startDelim.end + 1
            and delimiters[i - 1].token == startDelim.token - 1
            and delimiters[startDelim.end + 1].token == endDelim.token + 1
            and delimiters[i - 1].marker == startDelim.marker
        )

        ch = chr(startDelim.marker)

        token = state.tokens[startDelim.token]
        token.type = "strong_open" if isStrong else "em_open"
        token.tag = "strong" if isStrong else "em"
        token.nesting = 1
        token.markup = ch + ch if isStrong else ch
        token.content = ""

        token = state.tokens[endDelim.token]
        token.type = "strong_close" if isStrong else "em_close"
        token.tag = "strong" if isStrong else "em"
        token.nesting = -1
        token.markup = ch + ch if isStrong else ch
        token.content = ""

        if isStrong:
            state.tokens[delimiters[i - 1].token].content = ""
            state.tokens[delimiters[startDelim.end + 1].token].content = ""
            i -= 1

        i -= 1


def postProcess(state: StateInline):
    """Walk through delimiter list and replace text tokens with tags."""
    _postProcess(state, state.delimiters)

    for token in state.tokens_meta:
        if token and "delimiters" in token:
            _postProcess(state, token["delimiters"])
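
Each marker char is pushed as its own text token here and only rewritten into em/strong tags after balance_pairs has matched the delimiters. A quick sketch (public API assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

print(md.renderInline("*em* and **strong**"))
# <em>em</em> and <strong>strong</strong>

# '_' is scanned with canSplitWord=False (marker == 0x2A is False above),
# so it cannot split a word:
print(md.renderInline("snake_case_name"))  # snake_case_name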

View File: markdown_it/rules_inline/entity.py

@@ -0,0 +1,53 @@
# Process html entity - &#123;, &#xAF;, &quot;, ...
import re

from ..common.entities import entities
from ..common.utils import fromCodePoint, isValidEntityCode
from .state_inline import StateInline

DIGITAL_RE = re.compile(r"^&#((?:x[a-f0-9]{1,6}|[0-9]{1,7}));", re.IGNORECASE)
NAMED_RE = re.compile(r"^&([a-z][a-z0-9]{1,31});", re.IGNORECASE)


def entity(state: StateInline, silent: bool):
    pos = state.pos
    maximum = state.posMax

    if state.srcCharCode[pos] != 0x26:  # /* & */
        return False

    if (pos + 1) < maximum:
        ch = state.srcCharCode[pos + 1]

        if ch == 0x23:  # /* # */
            match = DIGITAL_RE.search(state.src[pos:])
            if match:
                if not silent:
                    match1 = match.group(1)
                    code = (
                        int(match1[1:], 16)
                        if match1[0].lower() == "x"
                        else int(match1, 10)
                    )
                    state.pending += (
                        fromCodePoint(code)
                        if isValidEntityCode(code)
                        else fromCodePoint(0xFFFD)
                    )

                state.pos += len(match.group(0))
                return True

        else:
            match = NAMED_RE.search(state.src[pos:])
            if match:
                if match.group(1) in entities:
                    if not silent:
                        state.pending += entities[match.group(1)]
                    state.pos += len(match.group(0))
                    return True

    if not silent:
        state.pending += "&"
    state.pos += 1
    return True
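
Decoded entities are appended to state.pending as plain text, so the effect is easiest to see on the token stream (rendering would re-escape the '&'). A sketch, assuming the public parseInline API:

from markdown_it import MarkdownIt

md = MarkdownIt()

tokens = md.parseInline('&quot; &#x4D; &#77; &unknown;')
print(tokens[0].children[0].content)
# " M M &unknown;   (unknown named entities fall through as a literal '&')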

View File: markdown_it/rules_inline/escape.py

@@ -0,0 +1,49 @@
"""
Process escaped chars and hardbreaks
"""
from ..common.utils import isSpace
from .state_inline import StateInline
ESCAPED = [0 for _ in range(256)]
for ch in "\\!\"#$%&'()*+,./:;<=>?@[]^_`{|}~-":
ESCAPED[ord(ch)] = 1
def escape(state: StateInline, silent: bool):
pos = state.pos
maximum = state.posMax
# /* \ */
if state.srcCharCode[pos] != 0x5C:
return False
pos += 1
if pos < maximum:
ch = state.srcCharCode[pos]
if ch < 256 and ESCAPED[ch] != 0:
if not silent:
state.pending += state.src[pos]
state.pos += 2
return True
if ch == 0x0A:
if not silent:
state.push("hardbreak", "br", 0)
pos += 1
# skip leading whitespaces from next line
while pos < maximum:
ch = state.srcCharCode[pos]
if not isSpace(ch):
break
pos += 1
state.pos = pos
return True
if not silent:
state.pending += "\\"
state.pos += 1
return True
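
A sketch of both branches (public API assumed; output shown for the commonmark preset, which emits XHTML-style <br />):

from markdown_it import MarkdownIt

md = MarkdownIt()

# escapable ASCII punctuation comes through literally:
print(md.renderInline(r"\*not emphasis\*"))  # *not emphasis*

# a backslash immediately before the newline is the other hardbreak syntax:
print(md.render("line one\\\nline two"))
# <p>line one<br />
# line two</p>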

View File: markdown_it/rules_inline/html_inline.py

@@ -0,0 +1,42 @@
# Process html tags
from ..common.html_re import HTML_TAG_RE
from .state_inline import StateInline


def isLetter(ch: int):
    lc = ch | 0x20  # to lower case
    # /* a */ and /* z */
    return (lc >= 0x61) and (lc <= 0x7A)


def html_inline(state: StateInline, silent: bool):
    pos = state.pos

    if not state.md.options.get("html", None):
        return False

    # Check start
    maximum = state.posMax
    if state.srcCharCode[pos] != 0x3C or pos + 2 >= maximum:  # /* < */
        return False

    # Quick fail on second char
    ch = state.srcCharCode[pos + 1]
    if (
        ch != 0x21  # /* ! */
        and ch != 0x3F  # /* ? */
        and ch != 0x2F  # /* / */
        and not isLetter(ch)
    ):
        return False

    match = HTML_TAG_RE.search(state.src[pos:])
    if not match:
        return False

    if not silent:
        token = state.push("html_inline", "", 0)
        token.content = state.src[pos : pos + len(match.group(0))]

    state.pos += len(match.group(0))
    return True
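
The rule only fires when the "html" option is on, which the commonmark preset used by MarkdownIt() enables (assuming this vendored copy matches upstream defaults); matching text is passed through verbatim as an html_inline token:

from markdown_it import MarkdownIt

md = MarkdownIt()

print(md.renderInline("a <span class='x'>b</span> c"))
# a <span class='x'>b</span> c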

View File: markdown_it/rules_inline/image.py

@@ -0,0 +1,150 @@
# Process ![image](<src> "title")
from __future__ import annotations

from ..common.utils import isSpace, normalizeReference
from ..token import Token
from .state_inline import StateInline


def image(state: StateInline, silent: bool):
    label = None
    href = ""
    oldPos = state.pos
    max = state.posMax

    # /* ! */
    if state.srcCharCode[state.pos] != 0x21:
        return False
    # /* [ */
    if state.pos + 1 < state.posMax and state.srcCharCode[state.pos + 1] != 0x5B:
        return False

    labelStart = state.pos + 2
    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos + 1, False)

    # parser failed to find ']', so it's not a valid link
    if labelEnd < 0:
        return False

    pos = labelEnd + 1

    # /* ( */
    if pos < max and state.srcCharCode[pos] == 0x28:
        #
        # Inline link
        #

        # [link](  <href>  "title"  )
        #        ^^ skipping these spaces
        pos += 1
        while pos < max:
            code = state.srcCharCode[pos]
            if not isSpace(code) and code != 0x0A:
                break
            pos += 1

        if pos >= max:
            return False

        # [link](  <href>  "title"  )
        #          ^^^^^^ parsing link destination
        start = pos
        res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax)
        if res.ok:
            href = state.md.normalizeLink(res.str)
            if state.md.validateLink(href):
                pos = res.pos
            else:
                href = ""

        # [link](  <href>  "title"  )
        #                ^^ skipping these spaces
        start = pos
        while pos < max:
            code = state.srcCharCode[pos]
            if not isSpace(code) and code != 0x0A:
                break
            pos += 1

        # [link](  <href>  "title"  )
        #                  ^^^^^^^ parsing link title
        res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax)
        if pos < max and start != pos and res.ok:
            title = res.str
            pos = res.pos

            # [link](  <href>  "title"  )
            #                         ^^ skipping these spaces
            while pos < max:
                code = state.srcCharCode[pos]
                if not isSpace(code) and code != 0x0A:
                    break
                pos += 1
        else:
            title = ""

        # /* ) */
        if pos >= max or state.srcCharCode[pos] != 0x29:
            state.pos = oldPos
            return False

        pos += 1

    else:
        #
        # Link reference
        #
        if "references" not in state.env:
            return False

        # /* [ */
        if pos < max and state.srcCharCode[pos] == 0x5B:
            start = pos + 1
            pos = state.md.helpers.parseLinkLabel(state, pos)
            if pos >= 0:
                label = state.src[start:pos]
                pos += 1
            else:
                pos = labelEnd + 1
        else:
            pos = labelEnd + 1

        # covers label == '' and label == undefined
        # (collapsed reference link and shortcut reference link respectively)
        if not label:
            label = state.src[labelStart:labelEnd]

        label = normalizeReference(label)

        ref = state.env["references"].get(label, None)
        if not ref:
            state.pos = oldPos
            return False

        href = ref["href"]
        title = ref["title"]

    #
    # We found the end of the link, and know for a fact it's a valid link
    # so all that's left to do is to call tokenizer.
    #
    if not silent:
        content = state.src[labelStart:labelEnd]

        tokens: list[Token] = []
        state.md.inline.parse(content, state.md, state.env, tokens)

        token = state.push("image", "img", 0)
        token.attrs = {"src": href, "alt": ""}
        token.children = tokens or None
        token.content = content

        if title:
            token.attrSet("title", title)

        # note, this is not part of markdown-it JS, but is useful for renderers
        if label and state.md.options.get("store_labels", False):
            token.meta["label"] = label

    state.pos = pos
    state.posMax = max
    return True
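
A sketch of both forms (public API assumed; figure.png, logo.png, and the titles are placeholders):

from markdown_it import MarkdownIt

md = MarkdownIt()

# inline form: destination/title are parsed inside "(...)", the label becomes the alt text
print(md.renderInline('![alt text](figure.png "A title")'))
# <img src="figure.png" alt="alt text" title="A title" />

# reference form: resolved via state.env["references"], which the block-level
# reference rule fills in during a full parse
print(md.render("![logo][id]\n\n[id]: logo.png"))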

View File: markdown_it/rules_inline/link.py

@@ -0,0 +1,149 @@
# Process [link](<to> "stuff")
from ..common.utils import isSpace, normalizeReference
from .state_inline import StateInline


def link(state: StateInline, silent: bool):
    href = ""
    title = ""
    label = None
    oldPos = state.pos
    maximum = state.posMax
    start = state.pos
    parseReference = True

    if state.srcCharCode[state.pos] != 0x5B:  # /* [ */
        return False

    labelStart = state.pos + 1
    labelEnd = state.md.helpers.parseLinkLabel(state, state.pos, True)

    # parser failed to find ']', so it's not a valid link
    if labelEnd < 0:
        return False

    pos = labelEnd + 1

    if pos < maximum and state.srcCharCode[pos] == 0x28:  # /* ( */
        #
        # Inline link
        #

        # might have found a valid shortcut link, disable reference parsing
        parseReference = False

        # [link](  <href>  "title"  )
        #        ^^ skipping these spaces
        pos += 1
        while pos < maximum:
            code = state.srcCharCode[pos]
            if not isSpace(code) and code != 0x0A:
                break
            pos += 1

        if pos >= maximum:
            return False

        # [link](  <href>  "title"  )
        #          ^^^^^^ parsing link destination
        start = pos
        res = state.md.helpers.parseLinkDestination(state.src, pos, state.posMax)
        if res.ok:
            href = state.md.normalizeLink(res.str)
            if state.md.validateLink(href):
                pos = res.pos
            else:
                href = ""

            # [link](  <href>  "title"  )
            #                ^^ skipping these spaces
            start = pos
            while pos < maximum:
                code = state.srcCharCode[pos]
                if not isSpace(code) and code != 0x0A:
                    break
                pos += 1

            # [link](  <href>  "title"  )
            #                  ^^^^^^^ parsing link title
            res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax)
            if pos < maximum and start != pos and res.ok:
                title = res.str
                pos = res.pos

                # [link](  <href>  "title"  )
                #                         ^^ skipping these spaces
                while pos < maximum:
                    code = state.srcCharCode[pos]
                    if not isSpace(code) and code != 0x0A:
                        break
                    pos += 1

        if pos >= maximum or state.srcCharCode[pos] != 0x29:  # /* ) */
            # parsing a valid shortcut link failed, fallback to reference
            parseReference = True

        pos += 1

    if parseReference:
        #
        # Link reference
        #
        if "references" not in state.env:
            return False

        if pos < maximum and state.srcCharCode[pos] == 0x5B:  # /* [ */
            start = pos + 1
            pos = state.md.helpers.parseLinkLabel(state, pos)
            if pos >= 0:
                label = state.src[start:pos]
                pos += 1
            else:
                pos = labelEnd + 1
        else:
            pos = labelEnd + 1

        # covers label == '' and label == undefined
        # (collapsed reference link and shortcut reference link respectively)
        if not label:
            label = state.src[labelStart:labelEnd]

        label = normalizeReference(label)

        ref = (
            state.env["references"][label] if label in state.env["references"] else None
        )
        if not ref:
            state.pos = oldPos
            return False

        href = ref["href"]
        title = ref["title"]

    #
    # We found the end of the link, and know for a fact it's a valid link
    # so all that's left to do is to call tokenizer.
    #
    if not silent:
        state.pos = labelStart
        state.posMax = labelEnd

        token = state.push("link_open", "a", 1)
        token.attrs = {"href": href}

        if title:
            token.attrSet("title", title)

        # note, this is not part of markdown-it JS, but is useful for renderers
        if label and state.md.options.get("store_labels", False):
            token.meta["label"] = label

        state.md.inline.tokenize(state)

        token = state.push("link_close", "a", -1)

    state.pos = pos
    state.posMax = maximum
    return True
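
A sketch of the two paths (public API assumed; example.com is a placeholder):

from markdown_it import MarkdownIt

md = MarkdownIt()

# inline form: the "(" after the label switches parseReference off
print(md.renderInline('[text](https://example.com "Title")'))
# <a href="https://example.com" title="Title">text</a>

# collapsed reference form: the label is resolved through env["references"]
print(md.render("[text][]\n\n[text]: https://example.com"))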

View File: markdown_it/rules_inline/newline.py

@@ -0,0 +1,43 @@
# Process '\n'
import re

from ..common.utils import charCodeAt, isSpace
from .state_inline import StateInline

endSpace = re.compile(r" +$")


def newline(state: StateInline, silent: bool):
    pos = state.pos

    # /* \n */
    if state.srcCharCode[pos] != 0x0A:
        return False

    pmax = len(state.pending) - 1
    maximum = state.posMax

    # '  \n' -> hardbreak
    # Lookup in pending chars is bad practice! Don't copy to other rules!
    # Pending string is stored in concat mode, indexed lookups will cause
    # conversion to flat mode.
    if not silent:
        if pmax >= 0 and charCodeAt(state.pending, pmax) == 0x20:
            if pmax >= 1 and charCodeAt(state.pending, pmax - 1) == 0x20:
                state.pending = endSpace.sub("", state.pending)
                state.push("hardbreak", "br", 0)
            else:
                state.pending = state.pending[:-1]
                state.push("softbreak", "br", 0)
        else:
            state.push("softbreak", "br", 0)

    pos += 1

    # skip heading spaces for next line
    while pos < maximum and isSpace(state.srcCharCode[pos]):
        pos += 1

    state.pos = pos
    return True
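
Two trailing spaces before the newline yield a hardbreak, one or none a softbreak, with the trailing spaces trimmed from state.pending. A sketch (commonmark preset output assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

print(md.render("hard  \nsoft\nend"))
# <p>hard<br />
# soft
# end</p>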

View File: markdown_it/rules_inline/state_inline.py

@@ -0,0 +1,175 @@
from __future__ import annotations

from collections import namedtuple
from collections.abc import MutableMapping
from dataclasses import dataclass
from typing import TYPE_CHECKING

from .._compat import DATACLASS_KWARGS
from ..common.utils import isMdAsciiPunct, isPunctChar, isWhiteSpace
from ..ruler import StateBase
from ..token import Token

if TYPE_CHECKING:
    from markdown_it import MarkdownIt


@dataclass(**DATACLASS_KWARGS)
class Delimiter:
    # Char code of the starting marker (number).
    marker: int

    # Total length of these series of delimiters.
    length: int

    # An amount of characters before this one that's equivalent to
    # current one. In plain English: if this delimiter does not open
    # an emphasis, neither do previous `jump` characters.
    #
    # Used to skip sequences like "*****" in one step, for 1st asterisk
    # value will be 0, for 2nd it's 1 and so on.
    jump: int

    # A position of the token this delimiter corresponds to.
    token: int

    # If this delimiter is matched as a valid opener, `end` will be
    # equal to its position, otherwise it's `-1`.
    end: int

    # Boolean flags that determine if this delimiter could open or close
    # an emphasis.
    open: bool
    close: bool

    level: bool | None = None


Scanned = namedtuple("Scanned", ["can_open", "can_close", "length"])


class StateInline(StateBase):
    def __init__(
        self, src: str, md: MarkdownIt, env: MutableMapping, outTokens: list[Token]
    ):
        self.src = src
        self.env = env
        self.md = md
        self.tokens = outTokens
        self.tokens_meta: list[dict | None] = [None] * len(outTokens)

        self.pos = 0
        self.posMax = len(self.src)
        self.level = 0
        self.pending = ""
        self.pendingLevel = 0

        # Stores { start: end } pairs. Useful for backtrack
        # optimization of pairs parse (emphasis, strikes).
        self.cache: dict[int, int] = {}

        # List of emphasis-like delimiters for current tag
        self.delimiters: list[Delimiter] = []

        # Stack of delimiter lists for upper level tags
        self._prev_delimiters: list[list[Delimiter]] = []

        # backticklength => last seen position
        self.backticks: dict[int, int] = {}
        self.backticksScanned = False

    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            f"(pos=[{self.pos} of {self.posMax}], token={len(self.tokens)})"
        )

    def pushPending(self):
        token = Token("text", "", 0)
        token.content = self.pending
        token.level = self.pendingLevel
        self.tokens.append(token)
        self.pending = ""
        return token

    def push(self, ttype, tag, nesting):
        """Push new token to "stream".
        If pending text exists - flush it as text token
        """
        if self.pending:
            self.pushPending()

        token = Token(ttype, tag, nesting)
        token_meta = None

        if nesting < 0:
            # closing tag
            self.level -= 1
            self.delimiters = self._prev_delimiters.pop()

        token.level = self.level

        if nesting > 0:
            # opening tag
            self.level += 1
            self._prev_delimiters.append(self.delimiters)
            self.delimiters = []
            token_meta = {"delimiters": self.delimiters}

        self.pendingLevel = self.level
        self.tokens.append(token)
        self.tokens_meta.append(token_meta)
        return token

    def scanDelims(self, start, canSplitWord):
        """
        Scan a sequence of emphasis-like markers, and determine whether
        it can start an emphasis sequence or end an emphasis sequence.

        - start - position to scan from (it should point at a valid marker);
        - canSplitWord - determine if these markers can be found inside a word
        """
        pos = start
        left_flanking = True
        right_flanking = True
        maximum = self.posMax
        marker = self.srcCharCode[start]

        # treat beginning of the line as a whitespace
        lastChar = self.srcCharCode[start - 1] if start > 0 else 0x20

        while pos < maximum and self.srcCharCode[pos] == marker:
            pos += 1

        count = pos - start

        # treat end of the line as a whitespace
        nextChar = self.srcCharCode[pos] if pos < maximum else 0x20

        isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
        isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))

        isLastWhiteSpace = isWhiteSpace(lastChar)
        isNextWhiteSpace = isWhiteSpace(nextChar)

        if isNextWhiteSpace:
            left_flanking = False
        elif isNextPunctChar:
            if not (isLastWhiteSpace or isLastPunctChar):
                left_flanking = False

        if isLastWhiteSpace:
            right_flanking = False
        elif isLastPunctChar:
            if not (isNextWhiteSpace or isNextPunctChar):
                right_flanking = False

        if not canSplitWord:
            can_open = left_flanking and ((not right_flanking) or isLastPunctChar)
            can_close = right_flanking and ((not left_flanking) or isNextPunctChar)
        else:
            can_open = left_flanking
            can_close = right_flanking

        return Scanned(can_open, can_close, count)
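
scanDelims can be exercised directly; a minimal sketch constructing the state by hand (the constructor signature is the one defined above):

from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

md = MarkdownIt()
state = StateInline("**bold** word", md, {}, [])

# run at the start of the line: left-flanking only
print(state.scanDelims(0, True))  # Scanned(can_open=True, can_close=False, length=2)
# run after a letter, before a space: right-flanking only
print(state.scanDelims(6, True))  # Scanned(can_open=False, can_close=True, length=2)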

View File: markdown_it/rules_inline/strikethrough.py

@@ -0,0 +1,131 @@
# ~~strike through~~
from __future__ import annotations

from .state_inline import Delimiter, StateInline


def tokenize(state: StateInline, silent: bool):
    """Insert each marker as a separate text token, and add it to delimiter list"""
    start = state.pos
    marker = state.srcCharCode[start]

    if silent:
        return False

    if marker != 0x7E:  # /* ~ */
        return False

    scanned = state.scanDelims(state.pos, True)
    length = scanned.length
    ch = chr(marker)

    if length < 2:
        return False

    if length % 2:
        token = state.push("text", "", 0)
        token.content = ch
        length -= 1

    i = 0
    while i < length:
        token = state.push("text", "", 0)
        token.content = ch + ch
        state.delimiters.append(
            Delimiter(
                **{
                    "marker": marker,
                    "length": 0,  # disable "rule of 3" length checks meant for emphasis
                    "jump": i // 2,  # for `~~` 1 marker = 2 characters
                    "token": len(state.tokens) - 1,
                    "end": -1,
                    "open": scanned.can_open,
                    "close": scanned.can_close,
                }
            )
        )

        i += 2

    state.pos += scanned.length
    return True


def _postProcess(state: StateInline, delimiters: list[Delimiter]):
    loneMarkers = []
    maximum = len(delimiters)

    i = 0
    while i < maximum:
        startDelim = delimiters[i]

        if startDelim.marker != 0x7E:  # /* ~ */
            i += 1
            continue

        if startDelim.end == -1:
            i += 1
            continue

        endDelim = delimiters[startDelim.end]

        token = state.tokens[startDelim.token]
        token.type = "s_open"
        token.tag = "s"
        token.nesting = 1
        token.markup = "~~"
        token.content = ""

        token = state.tokens[endDelim.token]
        token.type = "s_close"
        token.tag = "s"
        token.nesting = -1
        token.markup = "~~"
        token.content = ""

        if (
            state.tokens[endDelim.token - 1].type == "text"
            and state.tokens[endDelim.token - 1].content == "~"
        ):
            loneMarkers.append(endDelim.token - 1)

        i += 1

    # If a marker sequence has an odd number of characters, it's split
    # like this: `~~~~~` -> `~` + `~~` + `~~`, leaving one marker at the
    # start of the sequence.
    #
    # So, we have to move all those markers after subsequent s_close tags.
    #
    while loneMarkers:
        i = loneMarkers.pop()
        j = i + 1

        while (j < len(state.tokens)) and (state.tokens[j].type == "s_close"):
            j += 1

        j -= 1

        if i != j:
            token = state.tokens[j]
            state.tokens[j] = state.tokens[i]
            state.tokens[i] = token


def postProcess(state: StateInline):
    """Walk through delimiter list and replace text tokens with tags."""
    tokens_meta = state.tokens_meta
    maximum = len(state.tokens_meta)

    _postProcess(state, state.delimiters)

    curr = 0
    while curr < maximum:
        try:
            curr_meta = tokens_meta[curr]
        except IndexError:
            pass
        else:
            if curr_meta and "delimiters" in curr_meta:
                _postProcess(state, curr_meta["delimiters"])
        curr += 1
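
The commonmark preset ships with this rule disabled, so it has to be switched on explicitly (an assumption based on upstream markdown-it-py; adjust if the vendored preset differs):

from markdown_it import MarkdownIt

md = MarkdownIt().enable("strikethrough")

print(md.renderInline("~~struck~~ but ~not~"))
# <s>struck</s> but ~not~   (a single '~' fails the length < 2 check)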

View File: markdown_it/rules_inline/text.py

@@ -0,0 +1,57 @@
# Skip text characters for text token, place those to pending buffer
# and increment current pos
from .state_inline import StateInline

# Rule to skip pure text
# '{}$%@~+=:' reserved for extensions

# !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~

# !!!! Don't confuse with "Markdown ASCII Punctuation" chars
# http://spec.commonmark.org/0.15/#ascii-punctuation-character


def isTerminatorChar(ch):
    return ch in {
        0x0A,  # /* \n */
        0x21,  # /* ! */
        0x23,  # /* # */
        0x24,  # /* $ */
        0x25,  # /* % */
        0x26,  # /* & */
        0x2A,  # /* * */
        0x2B,  # /* + */
        0x2D,  # /* - */
        0x3A,  # /* : */
        0x3C,  # /* < */
        0x3D,  # /* = */
        0x3E,  # /* > */
        0x40,  # /* @ */
        0x5B,  # /* [ */
        0x5C,  # /* \ */
        0x5D,  # /* ] */
        0x5E,  # /* ^ */
        0x5F,  # /* _ */
        0x60,  # /* ` */
        0x7B,  # /* { */
        0x7D,  # /* } */
        0x7E,  # /* ~ */
    }


def text(state: StateInline, silent: bool, **args):
    pos = state.pos
    posMax = state.posMax
    while (pos < posMax) and not isTerminatorChar(state.srcCharCode[pos]):
        pos += 1

    if pos == state.pos:
        return False

    if not silent:
        state.pending += state.src[state.pos : pos]

    state.pos = pos

    return True
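
Runs of non-terminator chars are buffered into state.pending in one step; terminator chars hand control to the other rules and, if nothing claims them, the inline parser appends them to pending one char at a time. A sketch (public API assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

children = md.parseInline("plain text, no markup!")[0].children
print([(t.type, t.content) for t in children])
# [('text', 'plain text, no markup!')]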

View File: markdown_it/rules_inline/text_collapse.py

@@ -0,0 +1,43 @@
from .state_inline import StateInline


def text_collapse(state: StateInline, *args):
    """
    Clean up tokens after emphasis and strikethrough postprocessing:
    merge adjacent text nodes into one and re-calculate all token levels

    This is necessary because initially emphasis delimiter markers (``*, _, ~``)
    are treated as their own separate text tokens. Then emphasis rule either
    leaves them as text (needed to merge with adjacent text) or turns them
    into opening/closing tags (which messes up levels inside).
    """
    level = 0
    maximum = len(state.tokens)

    curr = last = 0
    while curr < maximum:
        # re-calculate levels after emphasis/strikethrough turns some text nodes
        # into opening/closing tags
        if state.tokens[curr].nesting < 0:
            level -= 1  # closing tag
        state.tokens[curr].level = level
        if state.tokens[curr].nesting > 0:
            level += 1  # opening tag

        if (
            state.tokens[curr].type == "text"
            and curr + 1 < maximum
            and state.tokens[curr + 1].type == "text"
        ):
            # collapse two adjacent text nodes
            state.tokens[curr + 1].content = (
                state.tokens[curr].content + state.tokens[curr + 1].content
            )
        else:
            if curr != last:
                state.tokens[last] = state.tokens[curr]
            last += 1
        curr += 1

    if curr != last:
        del state.tokens[last:]
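
The merging is visible on any source where a delimiter run fails to pair: the leftover marker stays a text token and is collapsed into its neighbours, leaving a single text node (public API assumed):

from markdown_it import MarkdownIt

md = MarkdownIt()

children = md.parseInline("a *b")[0].children
print([(t.type, t.content) for t in children])
# [('text', 'a *b')]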