# Source code for feathr.definition.feature

import re
from copy import deepcopy
from typing import List, Optional, Union, Dict

from jinja2 import Template

from feathr.definition.dtype import FeatureType
from feathr.definition.transformation import ExpressionTransformation, Transformation, WindowAggTransformation
from feathr.definition.typed_key import DUMMY_KEY, TypedKey
from feathr.definition.feathrconfig import HoconConvertible
from feathr.api.app.core.feathr_api_exception import FeatureNameValidationError


class FeatureBase(HoconConvertible):
    """The base class for features.

    It has a feature name, feature type, and a convenient transformation used to produce its feature value.

    Attributes:
        name: Unique name of the feature. Only alphabet, numbers, and '_' are allowed in the name.
                It can not start with numbers. Note that '.' is NOT ALLOWED!
        feature_type: the feature value type. e.g. INT32, FLOAT, etc. feathr.dtype
        key: The key(s) of this feature, always stored as a list. e.g. user_id.
        transform: A transformation used to produce its feature value. e.g. amount * 10
        registry_tags: A dict of (str, str) that you can pass to feature registry for better organization. For example, you can use {"deprecated": "true"} to indicate this feature is deprecated, etc.
        feature_alias: Name used to reference this feature in derived features. Defaults to `name`.
        key_alias: The key column aliases of the non-empty entries in `key`.
    """

    # Compiled once for the class instead of on every validation call.
    # Used with `fullmatch`, so no explicit ^/$ anchors are needed and a
    # trailing newline is not silently accepted (which `$` would allow).
    _FEATURE_NAME_PATTERN = re.compile(
        r"""[a-zA-Z_]      # first character can only be a letter or underscore
            [a-zA-Z0-9_]*  # then any number of letters, numbers, or underscores
        """,
        re.X)

    def __init__(self,
                 name: str,
                 feature_type: FeatureType,
                 transform: Optional[Union[str, Transformation]] = None,
                 key: Optional[Union[TypedKey, List[TypedKey]]] = [DUMMY_KEY],
                 registry_tags: Optional[Dict[str, str]] = None,
                 ):
        """Create a feature definition.

        Args:
            name: Feature name; validated by `validate_feature_name`.
            feature_type: The feature value type.
            transform: A `Transformation`, or a string that is wrapped in an
                `ExpressionTransformation`. When omitted, the feature name itself
                is used as the expression.
            key: A single key or a list of keys. A single key (including `None`)
                is wrapped in a one-element list.
            registry_tags: Optional tags forwarded to the feature registry.

        Raises:
            FeatureNameValidationError: If `name` is empty or violates the naming rule.
        """
        FeatureBase.validate_feature_name(name)
        self.name = name
        self.feature_type = feature_type
        self.registry_tags = registry_tags
        # Copy list input so instances never share the caller's list nor the
        # (mutable) default argument list with each other.
        self.key = list(key) if isinstance(key, list) else [key]
        # feature_alias: Rename the derived feature to `feature_alias`. Default to feature name.
        self.feature_alias = name
        # If no transformation is specified, default to referencing a field with the same name
        if transform is None:
            self.transform = ExpressionTransformation(name)
        elif isinstance(transform, str):
            self.transform = ExpressionTransformation(transform)
        else:
            self.transform = transform
        # An alias for the key in this feature. Default to its key column alias. Useful in derived features.
        # self.key could be null, when getting features from registry.
        self.key_alias = [k.key_column_alias for k in self.key if k]

    @classmethod
    def validate_feature_name(cls, feature_name: str) -> bool:
        """Check that a feature name follows the naming rule.

        Only alphabet, numbers, and '_' are allowed in the name.
        It can not start with numbers. Note that '.' is NOT ALLOWED!
        This is because some compute engines, such as Spark, will consider them as operators in feature name.

        Args:
            feature_name: The candidate feature name.

        Returns:
            True when the name is valid.

        Raises:
            FeatureNameValidationError: If the name is empty or violates the rule.
        """
        if not feature_name:
            raise FeatureNameValidationError('Feature name rule violation: empty feature name detected')

        if not cls._FEATURE_NAME_PATTERN.fullmatch(feature_name):
            raise FeatureNameValidationError(
                'Feature name rule violation: only letters, numbers, and underscores are allowed in the name, ' +
                f'and the name cannot start with a number. name={feature_name}')

        return True

    def with_key(self, key_alias: Union[str, List[str]]):
        """Rename the feature key with the alias. This is useful in derived features that depends on
        the same feature with different keys.

        Args:
            key_alias: One alias, or one alias per key of this feature (same order as `self.key`).

        Returns:
            A deep copy of this feature whose keys carry the new column aliases.
            The original feature is left untouched.

        Raises:
            AssertionError: If the number of aliases differs from the number of keys.
        """
        # Copy so the returned feature does not share the caller's list.
        cleaned_key_alias = [key_alias] if isinstance(key_alias, str) else list(key_alias)
        # Raised explicitly (rather than via `assert`) so the check survives `python -O`,
        # while keeping the exception type existing callers may rely on.
        if len(cleaned_key_alias) != len(self.key):
            raise AssertionError(
                f'Expected {len(self.key)} key alias(es) but got {len(cleaned_key_alias)}')

        new_key = []
        for typed_key, alias in zip(self.key, cleaned_key_alias):
            renamed_key = deepcopy(typed_key)
            renamed_key.key_column_alias = alias
            new_key.append(renamed_key)

        res = deepcopy(self)
        res.key = new_key
        res.key_alias = cleaned_key_alias
        return res

    def as_feature(self, feature_alias):
        """Provide the feature a different alias, which can be used to reference the feature in transformation
        expression. This is useful in derived features that depends on the same feature with different keys.

        Args:
            feature_alias: The new alias.

        Returns:
            A deep copy of this feature with `feature_alias` replaced.
        """
        new_feature = deepcopy(self)
        new_feature.feature_alias = feature_alias
        return new_feature


class Feature(FeatureBase):
    """A feature is an individual measurable property or characteristic of an entity.

    It has a feature name, feature type, and a convenient row transformation used to produce its feature value.

    Attributes:
        name: Unique name of the feature. Only alphabet, numbers, and '_' are allowed in the name.
                It can not start with numbers. Note that '.' is NOT ALLOWED!
        feature_type: the feature value type. e.g. INT32, FLOAT, etc. Should be part of `feathr.dtype`
        key: The key of this feature. e.g. user_id.
        transform: A row transformation used to produce its feature value. e.g. amount * 10
        registry_tags: A dict of (str, str) that you can pass to feature registry for better organization. For example, you can use {"deprecated": "true"} to indicate this feature is deprecated, etc.
    """

    def __init__(self,
                 name: str,
                 feature_type: FeatureType,
                 key: Optional[Union[TypedKey, List[TypedKey]]] = [DUMMY_KEY],
                 transform: Optional[Union[str, Transformation]] = None,
                 registry_tags: Optional[Dict[str, str]] = None,
                 ):
        """Create a feature.

        Note that `key` comes before `transform` here, the reverse of the
        `FeatureBase` constructor; the arguments are reordered when forwarded.
        """
        # Hand the base class a private copy so instances never end up sharing
        # the mutable default list (or the caller's list) with each other.
        if isinstance(key, list):
            key = list(key)
        super().__init__(name, feature_type, transform, key, registry_tags)

    def to_feature_config(self) -> str:
        """Render this feature as a config snippet.

        Returns:
            A block named after the feature that contains the transformation's
            config followed by the feature type's config.
        """
        # NOTE(review): the line breaks inside this template were reconstructed;
        # the extracted source had them collapsed to spaces. Each entry is kept
        # on its own line because HOCON separates fields by newline or comma.
        # Confirm against the upstream file.
        tm = Template("""
            {{feature.name}}: {
                {{feature.transform.to_feature_config()}}
                {{feature.feature_type.to_feature_config()}}
            }
        """)
        return tm.render(feature=self)