seminars.fb

Regular Expressions ======= ===========

'''
Basics
======
abcd           exact match of some string 'abcd'
a.cd           dot matches any character (except a newline)
                  DOTALL: allows the dot to match newlines
[aeiou]        match anything inside the []
[A-Za-z0-9-,]  matches anything [ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,-]
                  (the '-' right after the 0-9 range is a literal hyphen)
[^aeiou]       match anything NOT in [aeiou]
abc|def        match either abc or def

Quantifier (how many times)
===========================
a*       zero or more
a+       one or more
a?       zero or one
a{m,n}   match between m and n

(Make quantifiers non-greedy:)
a*?
a+?
a{m,n}?

Character Ranges
================
\s      [ \t\f\v\r\n] any whitespace
\d      [0-9]         any digit
\w      [a-zA-Z0-9_]  any word character
          with .LOCALE  [0-9_] | alphanumeric
          with .UNICODE [0-9_] | alphanumeric
\S      [^ \t\f\v\r\n]  any NON-whitespace
\D      [^0-9]          any NON-digit
\W      [^a-zA-Z0-9_]   any NON-word

Position
========
^   beginning of a string
$   end of the string
\b  match on a word boundary

match object
============
mo.start(), mo.end()   starting and ending index of the match
mo.span()              (start, end) index as a tuple
mo.groups()            all the capture groups
mo.group(0)            entire string that matched
mo.group(x)            x>0, some specific capture group

`re` functions
==============
# re.match
re.search    - searches for some pattern in some string
               returns a "match object"
re.findall   - finds all the matches (as a list of strings)
re.finditer  - find all the matches (as an iterable of match objects)
re.fullmatch - match the pattern against the entire string
               returning a match object

flags
=====
DOTALL      make the dot character match anything (incl. newlines)
IGNORECASE  case insensitive
MULTILINE   ^ and $ also match at the beginning/end of each line (not only of the whole string)
UNICODE     unicode compatibility
'''

from re import compile as re_compile, UNICODE
from collections import namedtuple
from datetime import date

# Building blocks of one task line; every named group ends up as a Task field.
# The separators between the date parts are a bare `.` (any single character),
# which is why both 2020/10/11 and 2020-12-13 are accepted.
# The lazy quantifiers (`+?`, `*?`) are safe here because the pattern is only
# ever applied with fullmatch(), which forces them to grow until the whole
# line is consumed.
DATE    = r'(?P<year>\d{4}).(?P<month>\d{2}).(?P<day>\d{2})'
MESSAGE = r'(?P<message>.+?)'
DEVICE  = r'device: (?P<device>[A-Za-z0-9-]+?)'
USER    = r'user: (?P<user>\w+?)'
COMMENT = r'\s*(#.*?)?'
PATTERN = re_compile(
    # raw string on purpose: a plain '\s+' is an invalid escape sequence in a
    # normal string literal and triggers a warning on current Python versions
    r'\s+'.join([
        DATE,
        MESSAGE,
        DEVICE,
        USER,
    ]) + COMMENT, # NOTE: no space needed before the comment
    UNICODE
)
text = '''
2020/10/11  swap device  device: rsw-123  user: adrian   # ignore
2020-12-13  rma device   device: fsw-456  user: cecilia
'''

Task = namedtuple('Task', 'date message device user')

# Turn each line that matches the whole pattern into a Task; any other line is
# silently skipped.
tasks = []
for line in text.strip().splitlines():
    if (mo := PATTERN.fullmatch(line)):
        # group() with several names returns the captures as one tuple
        year, month, day = (int(part) for part in mo.group('year', 'month', 'day'))
        message = mo.group('message').strip()
        device = mo.group('device').strip()
        user = mo.group('user').strip()
        t = Task(date(year, month, day), message, device, user)
        tasks.append(t)

print(f'{tasks = }')

# Simple queries over the parsed records.
cecilia_tasks = [t for t in tasks if t.user == 'cecilia']
overdue_tasks = [t for t in tasks if t.date < date.today()]

print(f'{cecilia_tasks = }')
print(f'{overdue_tasks = }')

# In a normal string literal every literal backslash has to be written twice.
print('c:\\users\\travis\\documents')
print('\\\\shares\\users\\travis')

# PRO-TIP #1: use r'' raw strings
#             for regular expressions
# (in a raw string the backslashes are kept exactly as typed, so nothing has
#  to be doubled)
print(r'C:\Users\travis\Documents')
print(r'\\shares\users\travis')

# Three ways of formatting the same value.
data = 123
print('%d' % 123)       # printf/%-style format (the oldest style; predates Python 2)
print('{}'.format(123)) # "advanced string formatting", i.e. str.format (PEP 3101, Python 2.6+)
print(f'{data}')        # f-string (introduced in Python 3.6)

t = 123, 456
# a tuple on the right-hand side of % is taken as the argument list, so `t` is
# wrapped in a 1-tuple to format it as a single value
print('%s' % (t,))
# '%%' is a literal percent sign: the first % produces '%d 123', and the
# second % then fills in that remaining %d, giving '456 123'
print('%%d %d' % 123 % 456)

# advanced-style formatting
print('{}'.format(123))
print('{:.2f}'.format(123))      # two decimal places
print('{:,.2f}'.format(123456))  # comma as thousands separator, two decimal places

# for a datetime the text after ':' is read as strftime directives
from datetime import datetime
print('{:%H:%M:%S}'.format(datetime.now()))
print('{:%H:%M:%S.%f}'.format(datetime.now()))


class T:
    """Toy class showing that an object decides what its format spec means."""

    def __format__(self, fmt):
        """Return the text for the format spec `fmt`.

        '%a' and '%b' map to fixed strings.  Any other spec, including the
        empty one used by '{}' and format(x), is delegated to
        object.__format__ rather than falling off the end and returning None
        (which makes format() raise TypeError).
        """
        if fmt == '%a':
            return 'aaa'
        elif fmt == '%b':
            return 'BBB'
        return super().__format__(fmt)
# The text after ':' in a replacement field is handed to the object's
# __format__ as the format spec, so T receives '%a' and '%b' here.
x = T()
print('{:%a}'.format(x))
print('{:%b}'.format(x))

# f-string

# the same format specs work inside an f-string replacement field
from datetime import datetime
print(f'{datetime.now():%H:%M:%S}')
print(f'{datetime.now():%H:%M:%S.%f}')

# use advanced string formatting if the format exists separately from the data
# use f-strings if the format and the data exist concurrently

# here the format lives in a template defined apart from the data, so it is
# filled in afterwards with str.format
TEMPLATE = '''
The date today is: {date:%Y-%m-%d}
'''

from datetime import datetime
print(TEMPLATE.format(date=datetime.now()))

# PRO-TIP #2: use f-strings for regular expressions

from re import compile as re_compile, UNICODE
from collections import namedtuple
from datetime import date

# Building blocks of one task line; every named group ends up as a Task field.
# The separators between the date parts are a bare `.` (any single character),
# which is why both 2020/10/11 and 2020-12-13 are accepted.
DATE    = r'(?P<year>\d{4}).(?P<month>\d{2}).(?P<day>\d{2})'
MESSAGE = r'(?P<message>.+?)'
DEVICE  = r'device: (?P<device>[A-Za-z0-9-]+?)'
USER    = r'user: (?P<user>\w+?)'
COMMENT = r'\s*(#.*?)?'
# Assembled with a raw f-string: the pieces are substituted by value and the
# `\s+` separators stay real backslashes (a plain '\s+' is an invalid escape
# sequence in a normal string literal).  The resulting pattern text is the
# pieces joined by `\s+`, followed directly by COMMENT.
PATTERN = re_compile(
    rf'{DATE}\s+{MESSAGE}\s+{DEVICE}\s+{USER}{COMMENT}', # NOTE: no space needed before the comment
    UNICODE
)
text = '''
2020/10/11  swap device  device: rsw-123  user: adrian   # ignore
2020-12-13  rma device   device: fsw-456  user: cecilia
'''

class Task(namedtuple('Task', 'date message device user')):
    """Task record that rejects dates before 2000-01-01 at construction.

    `dt` fills the `date` field; naming the parameter `date` would shadow the
    `date` class used in the check below.

    NOTE(review): namedtuple's _replace()/_make() appear to build instances
    without going through __new__, so the date check may not run for them;
    confirm before relying on it.
    """

    # No per-instance __dict__: keeps instances as small as a plain namedtuple.
    __slots__ = ()

    def __new__(cls, dt, message, device, user):
        """Create the record; raise ValueError if `dt` is before 2000-01-01."""
        if dt < date(2000, 1, 1):
            raise ValueError(f'date too early {dt}')
        return super().__new__(cls, dt, message, device, user)

# Build a Task from every line of `text` that matches PATTERN in full; lines
# that do not match are skipped.
tasks = []
for line in text.strip().splitlines():
    mo = PATTERN.fullmatch(line)
    if mo is None:
        continue
    # group() with several names hands back the captures as one tuple
    year, month, day = map(int, mo.group('year', 'month', 'day'))
    message, device, user = map(str.strip, mo.group('message', 'device', 'user'))
    t = Task(date(year, month, day), message, device, user)
    tasks.append(t)

print(f'{tasks = }')

# Queries over the parsed records.
cecilia_tasks = [task for task in tasks if task.user == 'cecilia']
overdue_tasks = [task for task in tasks if task.date < date.today()]

print(f'{cecilia_tasks = }')
print(f'{overdue_tasks = }')

# _replace returns a copy with the given field swapped, leaving the original
# records untouched.
next_half_tasks = [
    task._replace(date=date(2021, 1, 1))
    for task in overdue_tasks
]

print(f'{next_half_tasks = }')

# Print what unicodedata.category() reports for a few sample characters: an
# uppercase letter, a lowercase letter, a digit, a subscript digit, a full
# stop and a space.
from unicodedata import category
print(f'{category("A") = }')
print(f'{category("a") = }')
print(f'{category("0") = }')
print(f'{category("₁") = }')
print(f'{category(".") = }')
print(f'{category(" ") = }')

# NOTE(review): normalize is not used in the lines visible here; presumably it
# is meant for the accented sample below. Confirm.
from unicodedata import normalize

# Sample words to sort. The commented-out entries add mixed case and an
# accented letter, presumably to show how plain sorted() orders them (TODO
# confirm).
adjectives = [
    'smart',
    'fast',
    'handsome',
    #'Brave',
    #'bold',
    #'élite',
]

print(f'{sorted(adjectives) = }')