# Regular Expressions
'''
Basics
======
abcd exact match of some string 'abcd'
a.cd dot matches any character (except a newline)
DOTALL: allows the dot to match newlines
[aeiou] match anything inside the []
[A-Za-z0-9-,] matches anything [ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,]
[^aeiou] match anything NOT in [aeiou]
abc|def match either abc or def
Quantifier (how many times)
===========================
a* zero or more
a+ one or more
a? zero or one
a{m,n} match between m and n
(Make quantifiers non-greedy:)
a*?
a+?
a{m,n}?
Character Ranges
================
\s [ \t\f\v\r\n] any whitespace
\d [0-9] any digit
\w [a-zA-Z0-9_] any word character
with .LOCALE [0-9_] | alphanumeric
with .UNICODE [0-9_] | alphanumeric
\S [^ \t\f\v\r\n] any NON-whitespace
\D [^0-9] any NON-digit
\W [^a-zA-Z0-9_] any NON-word
Position
========
^ beginning of a string
$ end of the string
\b match on a word boundary
match object
============
mo.start(), mo.end() starting and ending index of the match
mo.span() (start, end) index as a tuple
mo.groups() all the capture groups
mo.group(0) entire string that matched
mo.group(x) x>0, some specific capture group
`re` functions
==============
# re.match
re.search - searches for some pattern in some string
returns a "match object"
re.findall - finds all the matches (as a list of strings)
re.finditer - find all the matches (as an iterable of match objects)
re.fullmatch - match the pattern against the entire string
returning a match object
flags
=====
DOTALL make the dot character match anything (incl. newlines)
IGNORECASE case insensitive
MULTILINE make ^ and $ match at the beginning/end of every line (not only of the whole string)
UNICODE Unicode compatibility (already the default for str patterns in Python 3)
'''
from re import compile as re_compile, UNICODE
from collections import namedtuple
from datetime import date
# Building blocks of the task-line pattern; every piece names its own capture
# group so the match object can be queried by field name.
# NOTE: the bare '.' between the date parts accepts any single separator
#       character, which is how both 2020/10/11 and 2020-12-13 get through.
DATE = r'(?P<year>\d{4}).(?P<month>\d{2}).(?P<day>\d{2})'
MESSAGE = r'(?P<message>.+?)'  # non-greedy: gives way as soon as the rest can match
DEVICE = r'device: (?P<device>[A-Za-z0-9-]+?)'
USER = r'user: (?P<user>\w+?)'
COMMENT = r'\s*(#.*?)?'  # optional trailing '# ...' remark

# The four fields are separated by runs of whitespace.  The separator must be
# a raw string: a plain '\s+' is an invalid escape sequence that only works
# by accident and triggers a warning on current Python versions.
PATTERN = re_compile(
    r'\s+'.join([
        DATE,
        MESSAGE,
        DEVICE,
        USER,
    ]) + COMMENT,  # NOTE: no space needed before the comment
    UNICODE,
)

# Sample input: one task per line, with an optional trailing comment.
text = '''
2020/10/11 swap device device: rsw-123 user: adrian # ignore
2020-12-13 rma device device: fsw-456 user: cecilia
'''
# Lightweight record for one parsed task line.
Task = namedtuple('Task', 'date message device user')

# Turn every line of `text` that fits PATTERN into a Task; lines that do not
# match are skipped.
tasks = []
for line in text.strip().splitlines():
    mo = PATTERN.fullmatch(line)
    if mo is None:
        continue
    # group() with several names returns the captures as one tuple
    year, month, day = map(int, mo.group('year', 'month', 'day'))
    message, device, user = (
        field.strip() for field in mo.group('message', 'device', 'user')
    )
    t = Task(date(year, month, day), message, device, user)
    tasks.append(t)
print(f'{tasks = }')

# Simple queries over the parsed records.
cecilia_tasks = [task for task in tasks if task.user == 'cecilia']
overdue_tasks = [task for task in tasks if task.date < date.today()]
print(f'{cecilia_tasks = }')
print(f'{overdue_tasks = }')
# In an ordinary literal every backslash has to be doubled.
print(
    'c:\\users\\travis\\documents',
    '\\\\shares\\users\\travis',
    sep='\n',
)

# PRO-TIP #1: use r'' raw strings
#             for regular expressions
# A raw literal keeps each backslash exactly as typed.
print(
    r'C:\Users\travis\Documents',
    r'\\shares\users\travis',
    sep='\n',
)
data = 123

# Three generations of string formatting, all rendering the same value.
print('%d' % 123)  # printf/%-style formatting (the oldest form)
print('{}'.format(123))  # "advanced string formatting" via str.format (PEP 3101)
print(f'{data}')  # f-string (introduced in Python 3.6)

t = (123, 456)

# %-style pitfall: a tuple on the right-hand side is treated as the argument
# list, so a tuple that should be shown as ONE value is wrapped in a 1-tuple.
print('%s' % (t,))

# '%%' is a literal percent sign: the first pass produces '%d 123', which the
# second pass then fills in with 456.
print(('%%d %d' % 123) % 456)
# advanced-style formatting: whatever follows ':' is the format spec
print(
    '{}'.format(123),
    '{:.2f}'.format(123),
    '{:,.2f}'.format(123456),
    sep='\n',
)

from datetime import datetime

# here the format spec is written with strftime-style directives
print(
    '{:%H:%M:%S}'.format(datetime.now()),
    '{:%H:%M:%S.%f}'.format(datetime.now()),
    sep='\n',
)
class T:
    """Toy type showing that ``__format__`` receives the raw format spec.

    The text after the ':' in a replacement field is handed to
    ``__format__`` untouched, so a class is free to invent its own
    mini-language.  This one understands two specs, '%a' and '%b'.
    """

    def __format__(self, fmt):
        """Render this object according to *fmt*.

        Args:
            fmt: the format spec, i.e. the text after ':' in '{:...}'.

        Returns:
            'aaa' for '%a' and 'BBB' for '%b'.  Any other spec is handed to
            the default implementation, which returns ``str(self)`` for the
            empty spec.

        Raises:
            TypeError: for a non-empty spec other than '%a' or '%b' (raised
                by ``object.__format__``).
        """
        if fmt == '%a':
            return 'aaa'
        if fmt == '%b':
            return 'BBB'
        # Falling off the end would return None, which makes even a plain
        # format(x) / f'{x}' fail with "__format__ must return a str".
        return super().__format__(fmt)


x = T()
print('{:%a}'.format(x))
print('{:%b}'.format(x))
# f-string: the same format specs work inline, right next to the expression
from datetime import datetime

print(
    f'{datetime.now():%H:%M:%S}',
    f'{datetime.now():%H:%M:%S.%f}',
    sep='\n',
)

# Rule of thumb:
#   - use advanced string formatting if the format exists separately from the data
#   - use f-strings if the format and the data exist concurrently
# TEMPLATE is the first case: it is written once and filled in later.
TEMPLATE = '''
The date today is: {date:%Y-%m-%d}
'''

from datetime import datetime

print(TEMPLATE.format(date=datetime.now()))
# PRO-TIP #2: use f-strings for regular expressions
from re import compile as re_compile, UNICODE
from collections import namedtuple
from datetime import date
# Building blocks of the task-line pattern; every piece names its own capture
# group so the match object can be queried by field name.
# NOTE: the bare '.' between the date parts accepts any single separator
#       character, which is how both 2020/10/11 and 2020-12-13 get through.
DATE = r'(?P<year>\d{4}).(?P<month>\d{2}).(?P<day>\d{2})'
MESSAGE = r'(?P<message>.+?)'  # non-greedy: gives way as soon as the rest can match
DEVICE = r'device: (?P<device>[A-Za-z0-9-]+?)'
USER = r'user: (?P<user>\w+?)'
COMMENT = r'\s*(#.*?)?'  # optional trailing '# ...' remark

# Assemble the pieces with a raw f-string so the whole line reads left to
# right.  The 'r' prefix matters: a plain '\s+' is an invalid escape sequence
# that only works by accident and triggers a warning on current Python
# versions.  The '{4}'/'{2}' quantifiers are safe because they live inside the
# interpolated pieces, not in the f-string template itself.
PATTERN = re_compile(
    rf'{DATE}\s+{MESSAGE}\s+{DEVICE}\s+{USER}{COMMENT}',  # NOTE: no space needed before the comment
    UNICODE,
)

# Sample input: one task per line, with an optional trailing comment.
text = '''
2020/10/11 swap device device: rsw-123 user: adrian # ignore
2020-12-13 rma device device: fsw-456 user: cecilia
'''
class Task(namedtuple('Task', 'date message device user')):
    """Immutable task record that rejects dates before 2000-01-01.

    Fields: ``date``, ``message``, ``device``, ``user``.
    """

    # No per-instance __dict__: instances stay as lean as plain tuples.
    __slots__ = ()

    def __new__(cls, dt, message, device, user):
        """Create a task after checking its date.

        Args:
            dt: value for the ``date`` field; compared against
                ``date(2000, 1, 1)``.
            message: value for the ``message`` field.
            device: value for the ``device`` field.
            user: value for the ``user`` field.

        Raises:
            ValueError: if *dt* is earlier than 2000-01-01.
        """
        if dt < date(2000, 1, 1):
            raise ValueError(f'date too early {dt}')
        return super().__new__(cls, dt, message, device, user)

    @classmethod
    def _make(cls, iterable):
        """Build a task from an iterable of the four field values.

        The generated ``_make`` (which ``_replace`` relies on) constructs the
        tuple directly and never calls ``__new__``, so the date check would be
        skipped.  Routing it through the constructor keeps ``_replace`` and
        ``_make`` subject to the same validation.

        Raises:
            ValueError: if the date value is earlier than 2000-01-01.
            TypeError: if *iterable* does not yield exactly four values.
        """
        return cls(*iterable)
# Turn every line of `text` that fits PATTERN into a Task; lines that do not
# match are skipped.
tasks = []
for line in text.strip().splitlines():
    mo = PATTERN.fullmatch(line)
    if mo is None:
        continue
    # group() with several names returns the captures as one tuple
    year, month, day = map(int, mo.group('year', 'month', 'day'))
    message, device, user = (
        field.strip() for field in mo.group('message', 'device', 'user')
    )
    t = Task(date(year, month, day), message, device, user)
    tasks.append(t)
print(f'{tasks = }')

# Simple queries over the parsed records.
cecilia_tasks = [task for task in tasks if task.user == 'cecilia']
overdue_tasks = [task for task in tasks if task.date < date.today()]
print(f'{cecilia_tasks = }')
print(f'{overdue_tasks = }')

# Records are immutable, so "rescheduling" means building copies that carry
# the new date.
next_half_tasks = [
    task._replace(date=date(2021, 1, 1)) for task in overdue_tasks
]
print(f'{next_half_tasks = }')
from unicodedata import category
# Show the Unicode general category reported for a handful of sample
# characters: upper/lower-case letters, digits, punctuation and a space.
print(
    f'{category("A") = }',
    f'{category("a") = }',
    f'{category("0") = }',
    f'{category("₁") = }',
    f'{category(".") = }',
    f'{category(" ") = }',
    sep='\n',
)
from unicodedata import normalize
# Words to sort.  The commented-out entries are there to be switched on one
# at a time, to see where capitalised and accented words end up when sorted().
adjectives = [
    'smart', 'fast', 'handsome',
    # 'Brave',
    # 'bold',
    # 'élite',
]
print(f'{sorted(adjectives) = }')