diff --git a/pyproject.toml b/pyproject.toml index 7ac141c..8e40bb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,11 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.5" +[tool.isort] +profile = "black" +skip_gitignore = true +line_length = 88 + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/simplematch.py b/simplematch.py index 281ad5b..22418a8 100644 --- a/simplematch.py +++ b/simplematch.py @@ -18,7 +18,7 @@ def register_type(name, regex, converter=str): - """ register a type to be available for the {value:type} matching syntax """ + """register a type to be available for the {value:type} matching syntax""" cleaned = TYPE_CLEANUP_REGEX.sub("(?:", regex) types[name] = Type(regex=cleaned, converter=converter) @@ -133,7 +133,7 @@ def _create_regex(self, pattern): @staticmethod def _grouplist(match): - """ extract unnamed match groups """ + """extract unnamed match groups""" # https://stackoverflow.com/a/53385788/300783 named = match.groupdict() ignored_groups = set() diff --git a/simplematch/__init__.py b/simplematch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/simplematch/converters.py b/simplematch/converters.py new file mode 100644 index 0000000..7c85e29 --- /dev/null +++ b/simplematch/converters.py @@ -0,0 +1,136 @@ +import decimal +from ipaddress import IPv4Address + + +class QuantifierMixin: + def __init__(self, args): + pass + + +class Str(QuantifierMixin): + regex = r".*" + + @staticmethod + def to_python(value: str) -> str: + return value + + +class Int(QuantifierMixin): + regex = r"[+-]?[0-9]" + + @staticmethod + def to_python(value: str) -> int: + return int(value) + + +class Float: + regex = r"[+-]?([0-9]*[.])?[0-9]+" + + @staticmethod + def to_python(value: str) -> float: + return float(value) + + +class Decimal(Float): + @staticmethod + def to_python(value: str) -> decimal.Decimal: + return decimal.Decimal(value) + + +class FourDigitYear(Int): + 
regex = "[0-9]{4}" + + @staticmethod + def to_python(value: str) -> int: + return int(value) + + +class Letters(Str): + regex = r"[a-zA-Z]+" + + +class RomanNumeral(Int): + regex = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})" + + +class Bitcoin(Str): + regex = r"(bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}" + + +class Email(Str): + regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" + + +class Url(Str): + regex = ( + r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b" + r"([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)" + ) + + +class IpV4: + regex = ( + r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" + ) + + def to_python(self, value) -> IPv4Address: + return IPv4Address(value) + + +class IpV6: + regex = ( + r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA" + r"-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){" + r"1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3" + r"}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0" + r"-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:" + r"(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5" + r"]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0" + r"-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3," + r"3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))" + ) + + +class Port: + regex = ( + r"((6553[0-5])|(655[0-2][0-9])|(65[0-4][0-9]{2})|(6[0-4][0-9]{3})|" + r"([1-5][0-9]{4})|([0-5]{0,5})|([0-9]{1,4}))" + ) + + +class MacAddress: + regex = r"[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}" + + +class SocialSecurityNumber(Str): + regex = r"(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}" + + +class CreditCard: + regex = ( + r"(^4[0-9]{12}(?:[0-9]{3})?$)|(^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][" + 
r"0-9]{2}|27[01][0-9]|2720)[0-9]{12}$)|(3[47][0-9]{13})|(^3(?:0[0-5]|[68][0-9])" + r"[0-9]{11}$)|(^6(?:011|5[0-9]{2})[0-9]{12}$)|(^(?:2131|1800|35\d{3})\d{11}$)" + ) + + +class LatLon: + regex = r"((\-?|\+?)?\d+(\.\d+)?),\s*((\-?|\+?)?\d+(\.\d+)?)" + + +class SemanticVersion: + regex = ( + r"(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)" + r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" + r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?" + r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?" + ) + + +class JiraIssueTicket: + regex = r"[A-Z]{2,}-\d+" + + +class Hashtag: + regex = r"#[^ !@#$%^&*(),.?\":{}|<>]*" diff --git a/simplematch/py.typed b/simplematch/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/simplematch/simplematch.py b/simplematch/simplematch.py new file mode 100644 index 0000000..b7afb3c --- /dev/null +++ b/simplematch/simplematch.py @@ -0,0 +1,153 @@ +""" +simplematch +""" +import re +from typing import NamedTuple, Optional +from collections import defaultdict + +from . import converters as cv + + +class Block(NamedTuple): + name: Optional[str] + converter: Optional[str] + args: Optional[str] + + +def block_parser_regex(block_start_string: str = "<", block_end_string: str = ">"): + """ + Assembles a regular expression which matches wildcards (`*`) and blocks + in the form of + + `<name:converter[args]>` + + Block delimiters (`<` and `>`) can be changed via the `block_start_string` and + `block_end_string` arguments. + + Matches have three captures: (`name`, `converter`, `args`). + """ + # https://regex101.com/r/xS2B04/3 + safe_chars = r"[^:\[\]%s%s]" % (block_start_string, block_end_string) + regex = re.compile( + r""" + (? str: + """ + This does two things: + 1. replaces a sm-syntax block with the regular expression given by the converter + 2. 
Adds the converter in the temporary list of converters + """ + # strip whitespace from within the block + name, _converter, _args = ( + x.strip() if x is not None else None for x in match.groups() + ) + # handle wildcard (*) + if name is _converter is _args is None: + return r".*" + converter = self.converters.get(_converter, cv.Str)() + self._tmp_converters[name or self.unnamed_key].append(converter) + return converter.regex + + def parse_pattern(self, pattern: str): + self._tmp_converters.clear() + result = self.block_parser_regex.sub(self._replacer, pattern) + return result, dict(self._tmp_converters) + + +DEFAULT_ENV = Environment( + block_start_string="<", + block_end_string=">", + unnamed_key="unnamed", +) + + +class Matcher: + def __init__( + self, + pattern: str = "*", + case_sensitive: bool = True, + environment=DEFAULT_ENV, + ): + self.pattern = pattern + self.case_sensitive = case_sensitive + self.environment = environment + self.regex, self.converters = self.environment.parse_pattern(pattern) + print("Regex: ", self.regex) + print("Conve: ", self.converters) + + +Matcher("*Test") +Matcher(" °C wheather ") +Matcher("<:url><:url>") + +# txt = """ +# \{test} +# {test:test[123]} +# °C +# < year : int[max=4]>-- +# <:float>*<:float><:float[ len = 2, case_sensitive]> +# <:float>\**\ +# +# """ + +# for x in DEFAULT_ENV.parse(txt): +# print(x) diff --git a/test_simplematch.py b/test_simplematch.py index b50e9f5..4e733ca 100644 --- a/test_simplematch.py +++ b/test_simplematch.py @@ -78,7 +78,7 @@ def test_simple_matching(): # should return None object if no match assert sm.match("{folder}/{filename}?{params}", "hello.js?p=1") is None - # should match strings with . (dot) and ? (question mart) sights + # should match strings with . (dot) and ? 
(question mark) signs assert sm.match("{folder}/{filename}?{params}", "home/hello.js?p=1") == dict( folder="home", filename="hello.js", params="p=1" ) @@ -240,6 +240,7 @@ def test_type_ccard(inp, result): ("https://xkcd.com/2293/", True), ("https://this-shouldn't.match@example.com", False), ("http://www.example.com/", True), + ("http:/ww.example.com/", False), ), ) def test_type_url(inp, is_url):