Source code for rolling_pin.conform_etl
from typing import Any, Dict, List, Union # noqa: F401
from IPython.display import HTML, Image # noqa: F401
from copy import deepcopy
from itertools import chain
from pathlib import Path
import re
from lunchbox.enforce import Enforce
from pandas import DataFrame
import lunchbox.tools as lbt
import yaml
from rolling_pin.blob_etl import BlobETL
from rolling_pin.conform_config import ConformConfig
import rolling_pin.tools as rpt
Rules = List[Dict[str, str]]
# ------------------------------------------------------------------------------
CONFORM_COLOR_SCHEME = deepcopy(rpt.COLOR_SCHEME)
CONFORM_COLOR_SCHEME.update({
'node_font': '#DE958E',
'node_value_font': '#B6ECF3',
'edge': '#DE958E',
'edge_value': '#B6ECF3',
'node_library_font': '#B6ECF3',
'node_module_font': '#DE958E',
'edge_library': '#B6ECF3',
'edge_module': '#DE958E'
})
[docs]
class ConformETL:
'''
ConformETL creates a DataFrame from a given directory of source files.
Then it generates target paths given a set of rules.
Finally, the conform method is called and the source files are copied to
their target filepaths.
'''
[docs]
@staticmethod
def _get_data(
source_rules=[], rename_rules=[], group_rules=[], line_rules=[]
):
# type: (Rules, Rules, Rules, Rules) -> DataFrame
'''
Generates DataFrame from given source_rules and then generates target
paths for them given other rules.
Args:
source_rules (Rules): A list of rules for parsing directories.
Default: [].
rename_rules (Rules): A list of rules for renaming source filepath
to target filepaths. Default: [].
group_rules (Rules): A list of rules for grouping files.
Default: [].
line_rules (Rules): A list of rules for peforming line copies on
files belonging to a given group. Default: [].
Returns:
DataFrame: Conform DataFrame.
'''
# source
source = [] # type: List[Any]
for rule in source_rules:
files = rpt.list_all_files(
rule['path'],
include_regex=rule.get('include', None),
exclude_regex=rule.get('exclude', None),
)
source.extend(files)
source = sorted([x.as_posix() for x in source])
data = DataFrame()
data['source'] = source
data['target'] = source
# rename
for rule in rename_rules:
data.target = data.target.apply(
lambda x: rpt.replace_and_format(
rule['regex'], rule['replace'], x
)
)
# group
data['groups'] = data.source.apply(lambda x: [])
for rule in group_rules:
mask = data.source \
.apply(lambda x: re.search(rule['regex'], x)) \
.astype(bool)
data.loc[mask, 'groups'] = data.groups \
.apply(lambda x: x + [rule['name']])
mask = data.groups.apply(lambda x: x == [])
data.loc[mask, 'groups'] = data.loc[mask, 'groups'] \
.apply(lambda x: ['base'])
# line
groups = set([x['group'] for x in line_rules])
data['line_rule'] = data.groups \
.apply(lambda x: len(set(x).intersection(groups)) > 0)
return data
[docs]
@classmethod
def from_yaml(cls, filepath):
# type: (Union[str, Path]) -> ConformETL
'''
Construct ConformETL instance from given yaml file.
Args:
filepath (str or Path): YAML file.
Raises:
EnforceError: If file does not end in yml or yaml.
Returns:
ConformETL: ConformETL instance.
'''
filepath = Path(filepath).as_posix()
ext = Path(filepath).suffix[1:].lower()
msg = f'{filepath} does not end in yml or yaml.'
Enforce(ext, 'in', ['yml', 'yaml'], message=msg)
# ----------------------------------------------------------------------
with open(filepath) as f:
config = yaml.safe_load(f)
return cls(**config)
[docs]
def __init__(
self, source_rules=[], rename_rules=[], group_rules=[], line_rules=[]
):
# type: (Rules, Rules, Rules, Rules) -> None
'''
Generates DataFrame from given source_rules and then generates target
paths for them given other rules.
Args:
source_rules (Rules): A list of rules for parsing directories.
Default: [].
rename_rules (Rules): A list of rules for renaming source filepath
to target filepaths. Default: [].
group_rules (Rules): A list of rules for grouping files.
Default: [].
line_rules (Rules): A list of rules for peforming line copies on
files belonging to a given group. Default: [].
Raises:
DataError: If configuration is invalid.
'''
config = dict(
source_rules=source_rules,
rename_rules=rename_rules,
group_rules=group_rules,
line_rules=line_rules,
)
cfg = ConformConfig(config)
cfg.validate()
config = cfg.to_native()
self._data = self._get_data(
source_rules=source_rules,
rename_rules=rename_rules,
group_rules=group_rules,
line_rules=line_rules,
) # type: DataFrame
self._line_rules = line_rules # type: Rules
[docs]
def __repr__(self):
# type: () -> str
'''
String representation of conform DataFrame.
Returns:
str: Table optimized for output to shell.
'''
data = self._data.copy()
data.line_rule = data.line_rule.apply(lambda x: 'X' if x else '')
data.rename(lambda x: x.upper(), axis=1, inplace=True)
output = data \
.to_string(index=False, max_colwidth=150, col_space=[50, 50, 20, 10])
return output
@property
def groups(self):
# type: () -> List[str]
'''
list[str]: List of groups found with self._data.
'''
output = self._data.groups.tolist()
output = sorted(list(set(chain(*output))))
output.remove('base')
output.insert(0, 'base')
return output
[docs]
def to_dataframe(self):
# type: () -> DataFrame
'''
Returns:
DataFrame: Copy of internal data.
'''
return self._data.copy()
[docs]
def to_blob(self):
# type: () -> BlobETL
'''
Converts self into a BlobETL object with target column as keys and
source columns as values.
Returns:
BlobETL: BlobETL of target and source filepaths.
'''
data = self._data
keys = data.target.tolist()
vals = data.source.tolist()
output = dict(zip(keys, vals))
return BlobETL(output)
[docs]
def to_html(
self, orient='lr', color_scheme=CONFORM_COLOR_SCHEME, as_png=False
):
# type: (str, Dict[str, str], bool) -> Union[Image, HTML]
'''
For use in inline rendering of graph data in Jupyter Lab.
Graph from target to source filepath. Target is in red, source is in
cyan.
Args:
orient (str, optional): Graph layout orientation. Default: lr.
Options include:
* tb - top to bottom
* bt - bottom to top
* lr - left to right
* rl - right to left
color_scheme: (dict, optional): Color scheme to be applied to graph.
Default: rolling_pin.conform_etl.CONFORM_COLOR_SCHEME
as_png (bool, optional): Display graph as a PNG image instead of
SVG. Useful for display on Github. Default: False.
Returns:
IPython.display.HTML: HTML object for inline display.
'''
return self.to_blob() \
.to_html(orient=orient, color_scheme=color_scheme, as_png=as_png)
[docs]
def conform(self, groups='all'):
# type: (Union[str, List[str]]) -> None
'''
Copies source files to target filepaths.
Args:
groups (str or list[str]): Groups of files which are to be conformed.
'all' means all groups. Default: 'all'.
'''
if isinstance(groups, str):
groups = [groups]
if groups == ['all']:
groups = self.groups
data = self.to_dataframe()
# copy files
grps = set(groups)
mask = data.groups \
.apply(lambda x: set(x).intersection(grps)) \
.apply(lambda x: len(x) > 0)
data = data[mask]
data.apply(lambda x: rpt.copy_file(x.source, x.target), axis=1)
# copy lines
data['text'] = data.source.apply(lambda x: lbt.try_(rpt.read_text, x, 'error'))
readable_mask = data.text.apply(lambda x: isinstance(x, str))
data.loc[~readable_mask, 'text'] = ''
rules = list(filter(lambda x: x['group'] in groups, self._line_rules))
for rule in rules:
mask = data.groups.apply(lambda x: rule['group'] in x)
data.loc[mask, 'text'] = data.loc[mask, 'text'].apply(
lambda x: rpt.filter_text(
x,
include_regex=rule.get('include', None),
exclude_regex=rule.get('exclude', None),
replace_regex=rule.get('regex', None),
replace_value=rule.get('replace', None),
)
)
data[readable_mask].apply(lambda x: rpt.write_text(x.text, x.target), axis=1)