Source code for feathr.client

import base64
import logging
import os
import tempfile
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Union
from feathr.definition.feature import FeatureBase

import redis
from azure.identity import DefaultAzureCredential
from jinja2 import Template
from pyhocon import ConfigFactory

from feathr.spark_provider._databricks_submission import _FeathrDatabricksJobLauncher

from feathr.registry._feature_registry_purview import _FeatureRegistry
from feathr.definition._materialization_utils import _to_materialization_config
from feathr.udf._preprocessing_pyudf_manager import _PreprocessingPyudfManager
from feathr.spark_provider._synapse_submission import _FeathrSynapseJobLauncher
from feathr.constants import *
from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration
from feathr.definition.feature_derivations import DerivedFeature
from feathr.definition.materialization_settings import MaterializationSettings
from feathr.protobuf.featureValue_pb2 import FeatureValue
from feathr.definition.query_feature_list import FeatureQuery
from feathr.definition.settings import ObservationSettings
from feathr.definition.anchor import FeatureAnchor
from feathr.utils._envvariableutil import _EnvVaraibleUtil
from feathr.utils._file_utils import write_to_file
from feathr.utils.feature_printer import FeaturePrinter


class FeatureJoinJobParams:
    """Parameters related to feature join job.

    Attributes:
        join_config_path: Path to the join config.
        observation_path: Absolute path in the cloud to the observation data.
        feature_config: Path to the features config.
        job_output_path: Absolute path in the cloud where you want your output data to be.
    """

    def __init__(self, join_config_path, observation_path, feature_config, job_output_path):
        self.join_config_path = join_config_path
        self.observation_path = observation_path
        self.feature_config = feature_config
        self.job_output_path = job_output_path


class FeatureGenerationJobParams:
    """Parameters related to feature generation job.

    Attributes:
        generation_config_path: Path to the feature generation config.
        feature_config: Path to the features config.
    """

    def __init__(self, generation_config_path, feature_config):
        self.generation_config_path = generation_config_path
        self.feature_config = feature_config



class FeathrClient(object):
    """Feathr client.

    The client is used to create training datasets, materialize features, register features, and fetch features from
    the online storage.

    For offline storage and compute engine, Azure ADLS, AWS S3 and Azure Synapse are supported.

    For online storage, currently only Redis is supported.
    The user of this client is responsible for setting up all the necessary information needed to start a Redis
    client via environment variables or a Spark cluster. Host address, port and password are needed to start the
    Redis client.

    Attributes:
        config_path (str, optional): config path. See [Feathr Config Template](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for more details. Defaults to "./feathr_config.yaml".
        local_workspace_dir (str, optional): set where the local workspace dir is. If not set, Feathr will create a temporary folder to store local workspace related files.
        credential (optional): credential to access cloud resources, most likely the returned result of DefaultAzureCredential(). If not set, Feathr will initialize DefaultAzureCredential() inside the __init__ function to get credentials.
        project_registry_tag (Dict[str, str]): tags to add for the project in the Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations at the project level. Default is empty.

    Raises:
        RuntimeError: Failed to create the client since necessary environment variables are not set for Redis
            client creation.
    """

    def __init__(self, config_path: str = "./feathr_config.yaml", local_workspace_dir: str = None, credential=None, project_registry_tag: Dict[str, str] = None):
        self.logger = logging.getLogger(__name__)
        # Redis key separator
        self._KEY_SEPARATOR = ':'
        envutils = _EnvVaraibleUtil(config_path)
        if local_workspace_dir:
            self.local_workspace_dir = local_workspace_dir
        else:
            # this is required for Windows
            tem_dir_obj = tempfile.TemporaryDirectory()
            self.local_workspace_dir = tem_dir_obj.name

        self.envutils = envutils

        if not os.path.exists(config_path):
            self.logger.warning('Configuration path does not exist, you need to set the environment variables explicitly. For all the environment variables, please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml')

        # Load all configs from yaml at initialization.
        # DO NOT load any configs from yaml during runtime.
        self.project_name = envutils.get_environment_variable_with_default(
            'project_config', 'project_name')

        # Redis configs
        self.redis_host = envutils.get_environment_variable_with_default(
            'online_store', 'redis', 'host')
        self.redis_port = envutils.get_environment_variable_with_default(
            'online_store', 'redis', 'port')
        self.redis_ssl_enabled = envutils.get_environment_variable_with_default(
            'online_store', 'redis', 'ssl_enabled')

        # S3 configs
        self.s3_endpoint = envutils.get_environment_variable_with_default(
            'offline_store', 's3', 's3_endpoint')

        # spark configs
        self.output_num_parts = envutils.get_environment_variable_with_default(
            'spark_config', 'spark_result_output_parts')
        self.spark_runtime = envutils.get_environment_variable_with_default(
            'spark_config', 'spark_cluster')

        self.credential = credential
        if self.spark_runtime not in {'azure_synapse', 'databricks'}:
            raise RuntimeError(
                'Only \'azure_synapse\' and \'databricks\' are currently supported.')
        elif self.spark_runtime == 'azure_synapse':
            # Feathr is a spark-based application so the feathr jar compiled from source code will be used in the
            # Spark job submission. The feathr jar hosted in cloud saves the time users needed to upload the jar from
            # their local env.
            self._FEATHR_JOB_JAR_PATH = \
                envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'feathr_runtime_location')

            if self.credential is None:
                self.credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)

            self.feathr_spark_laucher = _FeathrSynapseJobLauncher(
                synapse_dev_url=envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'dev_url'),
                pool_name=envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'pool_name'),
                datalake_dir=envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'workspace_dir'),
                executor_size=envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'executor_size'),
                executors=envutils.get_environment_variable_with_default(
                    'spark_config', 'azure_synapse', 'executor_num'),
                credential=self.credential
            )
        elif self.spark_runtime == 'databricks':
            # Feathr is a spark-based application so the feathr jar compiled from source code will be used in the
            # Spark job submission. The feathr jar hosted in cloud saves the time users needed to upload the jar from
            # their local env.
            self._FEATHR_JOB_JAR_PATH = \
                envutils.get_environment_variable_with_default(
                    'spark_config', 'databricks', 'feathr_runtime_location')

            self.feathr_spark_laucher = _FeathrDatabricksJobLauncher(
                workspace_instance_url=envutils.get_environment_variable_with_default(
                    'spark_config', 'databricks', 'workspace_instance_url'),
                token_value=_EnvVaraibleUtil.get_environment_variable(
                    'DATABRICKS_WORKSPACE_TOKEN_VALUE'),
                config_template=envutils.get_environment_variable_with_default(
                    'spark_config', 'databricks', 'config_template'),
                databricks_work_dir=envutils.get_environment_variable_with_default(
                    'spark_config', 'databricks', 'work_dir')
            )

        self._construct_redis_client()

        # initialize registry
        self.registry_delimiter = envutils.get_environment_variable_with_default(
            'feature_registry', 'purview', 'delimiter')
        self.azure_purview_name = envutils.get_environment_variable_with_default(
            'feature_registry', 'purview', 'purview_name')
        # initialize the registry no matter whether we set purview name or not, given some of the methods are used there.
        self.registry = _FeatureRegistry(self.project_name, self.azure_purview_name, self.registry_delimiter,
                                         project_registry_tag, config_path=config_path, credential=self.credential)

    def _check_required_environment_variables_exist(self):
        """Checks if the required environment variables (from feathr_config.yaml) are set.

        Some required information has to be set via environment variables so the client can work.
        """
        for required_field in self.required_fields:
            if required_field not in os.environ:
                raise RuntimeError(f'{required_field} is not set in environment variable. All required environment '
                                   f'variables are: {self.required_fields}.')
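
    # Usage sketch (editor's illustration, not part of feathr.client): constructing a client from a
    # feathr_config.yaml in the current directory. The config path and the explicit credential are
    # example choices, and the package-level re-export `from feathr import FeathrClient` is assumed.
    #
    #     from feathr import FeathrClient
    #     from azure.identity import DefaultAzureCredential
    #
    #     client = FeathrClient(config_path="./feathr_config.yaml",
    #                           credential=DefaultAzureCredential())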
    def register_features(self, from_context: bool = True):
        """Registers features based on the current workspace.

        Args:
            from_context: If from_context is True (default), the features will be generated from the current
                context, with the features previously built in client.build_features(). Otherwise, the features
                will be generated from configuration files.
        """

        if from_context:
            # make sure those items are in `self`
            if 'anchor_list' in dir(self) and 'derived_feature_list' in dir(self):
                _FeatureRegistry.save_to_feature_config_from_context(self.anchor_list, self.derived_feature_list, self.local_workspace_dir)
                self.registry.register_features(self.local_workspace_dir, from_context=from_context, anchor_list=self.anchor_list, derived_feature_list=self.derived_feature_list)
            else:
                raise RuntimeError("Please call FeathrClient.build_features() first in order to register features")
        else:
            self.registry.register_features(self.local_workspace_dir, from_context=from_context)
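
    # Usage sketch (editor's illustration): registering features after they have been built in the
    # current context. `anchors` and `derived` stand for previously defined FeatureAnchor /
    # DerivedFeature lists and are placeholders, not names from this module.
    #
    #     client.build_features(anchor_list=anchors, derived_feature_list=derived)
    #     client.register_features(from_context=True)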
    def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_list: List[DerivedFeature] = [], verbose: bool = False):
        """Build features based on the current workspace. All actions that trigger a Spark job will be based on the
        result of this action.
        """
        # Run necessary validations
        # anchor name and source name should be unique
        anchor_names = {}
        source_names = {}
        for anchor in anchor_list:
            if anchor.name in anchor_names:
                raise RuntimeError(f"Anchor name should be unique but there are duplicate anchor names in your anchor "
                                   f"definitions. Anchor name of {anchor} is already defined in {anchor_names[anchor.name]}")
            else:
                anchor_names[anchor.name] = anchor
            if anchor.source.name in source_names:
                raise RuntimeError(f"Source name should be unique but there are duplicate source names in your source "
                                   f"definitions. Source name of {anchor.source} is already defined in {source_names[anchor.source.name]}")
            else:
                source_names[anchor.source.name] = anchor.source

        preprocessingPyudfManager = _PreprocessingPyudfManager()
        _PreprocessingPyudfManager.build_anchor_preprocessing_metadata(anchor_list, self.local_workspace_dir)
        self.registry.save_to_feature_config_from_context(anchor_list, derived_feature_list, self.local_workspace_dir)
        self.anchor_list = anchor_list
        self.derived_feature_list = derived_feature_list

        # Pretty print anchor_list
        if verbose and self.anchor_list:
            FeaturePrinter.pretty_print_anchors(self.anchor_list)
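
    # Usage sketch (editor's illustration): building features from locally defined anchors. The
    # source path, key, feature name and the package-level re-exports (FeatureAnchor, Feature,
    # HdfsSource, TypedKey, ValueType, FLOAT) are assumptions for the example, not values taken
    # from this module.
    #
    #     from feathr import FeatureAnchor, Feature, HdfsSource, TypedKey, ValueType, FLOAT
    #
    #     batch_source = HdfsSource(name="nycTaxiBatchSource",
    #                               path="abfss://container@account.dfs.core.windows.net/demo_data/")
    #     location_id = TypedKey(key_column="DOLocationID", key_column_type=ValueType.INT32)
    #     features = [Feature(name="f_trip_distance", key=location_id, feature_type=FLOAT,
    #                         transform="trip_distance")]
    #     anchor = FeatureAnchor(name="request_features", source=batch_source, features=features)
    #     client.build_features(anchor_list=[anchor])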
    def list_registered_features(self, project_name: str = None) -> List[str]:
        """List all the already registered features. If project_name is not provided or is None, it will return all
        the registered features; otherwise it will only return features under this project.
        """
        return self.registry.list_registered_features(project_name)
    def _get_registry_client(self):
        """
        Returns the registry client in case users want to perform more advanced operations
        """
        return self.registry._get_registry_client()
    def get_online_features(self, feature_table, key, feature_names):
        """Fetches feature values for a certain key from an online feature table.

        Args:
            feature_table: the name of the feature table.
            key: the key of the entity
            feature_names: list of feature names to fetch

        Return:
            A list of feature values for this entity. It's ordered by the requested feature names.
            For example, feature_names = ['f_is_medium_trip_distance', 'f_day_of_week', 'f_day_of_month',
            'f_hour_of_day'] then, the returned feature values are: [b'true', b'4.0', b'31.0', b'23.0'].
            If the feature_table or key doesn't exist, then a list of Nones is returned. For example,
            [None, None, None, None].
            If a feature doesn't exist, then a None is returned for that feature. For example:
            [None, b'4.0', b'31.0', b'23.0'].
        """
        redis_key = self._construct_redis_key(feature_table, key)
        res = self.redis_clint.hmget(redis_key, *feature_names)
        return self._decode_proto(res)
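
    # Usage sketch (editor's illustration): fetching a single entity's features from the online
    # store. The table name, key and feature names are made-up example values.
    #
    #     res = client.get_online_features('nycTaxiDemoFeature', '265',
    #                                      ['f_location_avg_fare', 'f_location_max_fare'])
    #     # `res` holds one value per requested feature name, in the requested order.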
    def multi_get_online_features(self, feature_table, keys, feature_names):
        """Fetches feature values for a list of keys from an online feature table. This is the batch version of the
        get API.

        Args:
            feature_table: the name of the feature table.
            keys: list of keys for the entities
            feature_names: list of feature names to fetch

        Return:
            A list of feature values for the requested entities. It's ordered by the requested feature names. For
            example, keys = [12, 24], feature_names = ['f_is_medium_trip_distance', 'f_day_of_week',
            'f_day_of_month', 'f_hour_of_day'] then, the returned feature values are:
            {'12': [b'false', b'5.0', b'1.0', b'0.0'], '24': [b'true', b'4.0', b'31.0', b'23.0']}.
            If the feature_table or key doesn't exist, then a list of Nones is returned. For example,
            {'12': [None, None, None, None], '24': [None, None, None, None]}
            If a feature doesn't exist, then a None is returned for that feature. For example:
            {'12': [None, b'4.0', b'31.0', b'23.0'], '24': [b'true', b'4.0', b'31.0', b'23.0']}.
        """
        with self.redis_clint.pipeline() as redis_pipeline:
            for key in keys:
                redis_key = self._construct_redis_key(feature_table, key)
                redis_pipeline.hmget(redis_key, *feature_names)
            pipeline_result = redis_pipeline.execute()

        decoded_pipeline_result = []
        for feature_list in pipeline_result:
            decoded_pipeline_result.append(self._decode_proto(feature_list))

        return dict(zip(keys, decoded_pipeline_result))
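
    # Usage sketch (editor's illustration): batch lookup for several keys in one Redis pipeline
    # round trip. Keys and feature names are placeholders.
    #
    #     res = client.multi_get_online_features('nycTaxiDemoFeature', ['239', '265'],
    #                                            ['f_location_avg_fare', 'f_location_max_fare'])
    #     # `res` is a dict keyed by entity key, with one value list per key.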
    def _decode_proto(self, feature_list):
        """Decode the bytes (in string form) via the base64 decoder. A dense array will be returned as a Python
        list. A sparse array will be returned as a tuple of an index array and a value array. The order of elements
        in the arrays won't be changed.
        """
        typed_result = []
        for raw_feature in feature_list:
            if raw_feature:
                feature_value = FeatureValue()
                decoded = base64.b64decode(raw_feature)
                feature_value.ParseFromString(decoded)
                feature_type = feature_value.WhichOneof('FeatureValueOneOf')
                if feature_type == 'boolean_value':
                    typed_result.append(feature_value.boolean_value)
                elif feature_type == 'string_value':
                    typed_result.append(feature_value.string_value)
                elif feature_type == 'float_value':
                    typed_result.append(feature_value.float_value)
                elif feature_type == 'double_value':
                    typed_result.append(feature_value.double_value)
                elif feature_type == 'int_value':
                    typed_result.append(feature_value.int_value)
                elif feature_type == 'long_value':
                    typed_result.append(feature_value.long_value)
                elif feature_type == 'int_array':
                    typed_result.append(feature_value.int_array.integers)
                elif feature_type == 'string_array':
                    typed_result.append(feature_value.string_array.strings)
                elif feature_type == 'float_array':
                    typed_result.append(feature_value.float_array.floats)
                elif feature_type == 'double_array':
                    typed_result.append(feature_value.double_array.doubles)
                elif feature_type == 'boolean_array':
                    typed_result.append(feature_value.boolean_array.booleans)
                elif feature_type == 'sparse_string_array':
                    typed_result.append((feature_value.sparse_string_array.index_integers,
                                         feature_value.sparse_string_array.value_strings))
                elif feature_type == 'sparse_bool_array':
                    typed_result.append((feature_value.sparse_bool_array.index_integers,
                                         feature_value.sparse_bool_array.value_booleans))
                elif feature_type == 'sparse_float_array':
                    typed_result.append((feature_value.sparse_float_array.index_integers,
                                         feature_value.sparse_float_array.value_floats))
                elif feature_type == 'sparse_double_array':
                    typed_result.append((feature_value.sparse_double_array.index_integers,
                                         feature_value.sparse_double_array.value_doubles))
                elif feature_type == 'sparse_long_array':
                    typed_result.append((feature_value.sparse_long_array.index_integers,
                                         feature_value.sparse_long_array.value_longs))
                else:
                    self.logger.debug("Failed to load the feature type. Maybe it's a new type that is not supported "
                                      "by this client version")
                    self.logger.debug(f"The raw feature is {raw_feature}.")
                    self.logger.debug(f"The loaded feature is {feature_value}")
                    typed_result.append(None)
            else:
                typed_result.append(raw_feature)
        return typed_result

    def _clean_test_data(self, feature_table):
        """
        WARNING: THIS IS ONLY USED FOR TESTING
        Clears a namespace in the Redis cache. This may be very time consuming.

        Args:
            feature_table: str, feature table, i.e. your prefix before the separator in the Redis database.
        """
        cursor = '0'
        ns_keys = feature_table + '*'
        while cursor != 0:
            # a count of 5000 per scan seems reasonably fast for our testing data
            cursor, keys = self.redis_clint.scan(
                cursor=cursor, match=ns_keys, count=5000)
            if keys:
                self.redis_clint.delete(*keys)

    def _construct_redis_key(self, feature_table, key):
        return feature_table + self._KEY_SEPARATOR + key

    def _construct_redis_client(self):
        """Constructs the Redis client. The host, port, credential and other parameters can be set via environment
        variables."""
        password = _EnvVaraibleUtil.get_environment_variable(REDIS_PASSWORD)
        host = self.redis_host
        port = self.redis_port
        ssl_enabled = self.redis_ssl_enabled

        redis_clint = redis.Redis(
            host=host,
            port=port,
            password=password,
            ssl=ssl_enabled)
        self.logger.info('Redis connection is successful and completed.')
        self.redis_clint = redis_clint
    def get_offline_features(self,
                             observation_settings: ObservationSettings,
                             feature_query: Union[FeatureQuery, List[FeatureQuery]],
                             output_path: str,
                             execution_configuratons: Union[SparkExecutionConfiguration, Dict[str, str]] = None,
                             udf_files=None,
                             verbose: bool = False
                             ):
        """
        Get offline features for the observation dataset

        Args:
            observation_settings: settings of the observation data, e.g. timestamp columns, input path, etc.
            feature_query: features that are requested to be joined onto the observation data
            output_path: output path of the job, i.e. the observation data with features attached.
            execution_configuratons: a dict that will be passed to the spark job when the job starts up, i.e. the
                "spark configurations". Note that not all of the configurations will be honored since some of them
                are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the
                [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list
                of spark configurations.
        """
        feature_queries = feature_query if isinstance(feature_query, List) else [feature_query]
        feature_names = []
        for feature_query in feature_queries:
            for feature_name in feature_query.feature_list:
                feature_names.append(feature_name)

        udf_files = _PreprocessingPyudfManager.prepare_pyspark_udf_files(feature_names, self.local_workspace_dir)

        # produce join config
        tm = Template("""
                {{observation_settings.to_feature_config()}}
                featureList: [
                    {% for list in feature_lists %}
                        {{list.to_feature_config()}}
                    {% endfor %}
                ]
                outputPath: "{{output_path}}"
            """)
        config = tm.render(feature_lists=feature_queries, observation_settings=observation_settings, output_path=output_path)

        config_file_name = "feature_join_conf/feature_join.conf"
        config_file_path = os.path.join(self.local_workspace_dir, config_file_name)

        # make sure `FeathrClient.build_features()` is called before getting offline features/materializing features,
        # otherwise users will be confused about which features are available.
        # build_features() assigns the anchor_list and derived_feature_list variables, hence we check whether those
        # two variables exist to make sure the above condition is met.
        if 'anchor_list' in dir(self) and 'derived_feature_list' in dir(self):
            _FeatureRegistry.save_to_feature_config_from_context(self.anchor_list, self.derived_feature_list, self.local_workspace_dir)
        else:
            raise RuntimeError("Please call FeathrClient.build_features() first in order to get offline features")

        # Pretty print feature_query
        if verbose and feature_query:
            FeaturePrinter.pretty_print_feature_query(feature_query)

        write_to_file(content=config, full_file_name=config_file_path)
        return self._get_offline_features_with_config(config_file_path, execution_configuratons, udf_files=udf_files)
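
    # Usage sketch (editor's illustration): joining features onto an observation dataset. The
    # paths, timestamp column and feature name are hypothetical; `location_id` reuses the
    # hypothetical TypedKey from the build_features sketch above, and client.build_features()
    # must already have been called.
    #
    #     from feathr import FeatureQuery, ObservationSettings
    #
    #     query = FeatureQuery(feature_list=["f_trip_distance"], key=location_id)
    #     settings = ObservationSettings(
    #         observation_path="abfss://container@account.dfs.core.windows.net/observations.csv",
    #         event_timestamp_column="lpep_dropoff_datetime",
    #         timestamp_format="yyyy-MM-dd HH:mm:ss")
    #     client.get_offline_features(observation_settings=settings,
    #                                 feature_query=query,
    #                                 output_path="abfss://container@account.dfs.core.windows.net/output.avro")
    #     client.wait_job_to_finish(timeout_sec=500)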
    def _get_offline_features_with_config(self, feature_join_conf_path='feature_join_conf/feature_join.conf', execution_configuratons: Dict[str, str] = None, udf_files=[]):
        """Joins the features to your offline observation dataset based on the join config.

        Args:
            feature_join_conf_path: Relative path to your feature join config file.
        """
        cloud_udf_paths = [self.feathr_spark_laucher.upload_or_get_cloud_path(udf_local_path) for udf_local_path in udf_files]
        feathr_feature = ConfigFactory.parse_file(feature_join_conf_path)

        feature_join_job_params = FeatureJoinJobParams(join_config_path=os.path.abspath(feature_join_conf_path),
                                                       observation_path=feathr_feature['observationPath'],
                                                       feature_config=os.path.join(self.local_workspace_dir, 'feature_conf/'),
                                                       job_output_path=feathr_feature['outputPath'],
                                                       )
        job_tags = {OUTPUT_PATH_TAG: feature_join_job_params.job_output_path}
        # set the output format in job tags if it's set by the user, so that it can be used to parse the job result in the helper function
        if execution_configuratons is not None and OUTPUT_FORMAT in execution_configuratons:
            job_tags[OUTPUT_FORMAT] = execution_configuratons[OUTPUT_FORMAT]
        '''
        - Job tags are job metadata and are not passed to the actual spark job (i.e. not visible to the spark job).
          They are more of a platform-level thing that Feathr wants to add (currently job tags only contain the job
          output URL and the job output format). They are carried over with the job and are visible to every Feathr
          client. Think of this as customized metadata for the job which would be odd to put in the spark job itself.
        - Job arguments (or sometimes called job parameters) are the command line arguments passed into the actual
          spark job. They are usually highly related to the spark job; in Feathr they are the input to the scala
          spark CLI. They are usually not spark specific (for example, specifying the location of the feature files).
        - Job configurations are "configurations" for the spark job and are usually spark specific, for example
          controlling the number of write parts for spark.
        Job configurations and job arguments (or job parameters) have quite some overlap (i.e. you can achieve the
        same goal by either using job arguments/parameters or job configurations), but job tags should only be used
        for metadata purposes.
        '''
        # submit the jars
        return self.feathr_spark_laucher.submit_feathr_job(
            job_name=self.project_name + '_feathr_feature_join_job',
            main_jar_path=self._FEATHR_JOB_JAR_PATH,
            python_files=cloud_udf_paths,
            job_tags=job_tags,
            main_class_name='com.linkedin.feathr.offline.job.FeatureJoinJob',
            arguments=[
                '--join-config', self.feathr_spark_laucher.upload_or_get_cloud_path(
                    feature_join_job_params.join_config_path),
                '--input', feature_join_job_params.observation_path,
                '--output', feature_join_job_params.job_output_path,
                '--feature-config', self.feathr_spark_laucher.upload_or_get_cloud_path(
                    feature_join_job_params.feature_config),
                '--num-parts', self.output_num_parts,
                '--s3-config', self._get_s3_config_str(),
                '--adls-config', self._get_adls_config_str(),
                '--blob-config', self._get_blob_config_str(),
                '--sql-config', self._get_sql_config_str(),
                '--snowflake-config', self._get_snowflake_config_str()
            ],
            reference_files_path=[],
            configuration=execution_configuratons
        )
    def get_job_result_uri(self, block=True, timeout_sec=300) -> str:
        """Gets the job output URI
        """
        if not block:
            return self.feathr_spark_laucher.get_job_result_uri()
        # Block the API by polling the job status and waiting for completion
        if self.feathr_spark_laucher.wait_for_completion(timeout_sec):
            return self.feathr_spark_laucher.get_job_result_uri()
        else:
            raise RuntimeError(
                'Spark job failed so output cannot be retrieved.')
    def get_job_tags(self) -> Dict[str, str]:
        """Gets the job tags
        """
        return self.feathr_spark_laucher.get_job_tags()
    def materialize_features(self, settings: MaterializationSettings, execution_configuratons: Union[SparkExecutionConfiguration, Dict[str, str]] = None, verbose: bool = False):
        """Materialize feature data

        Args:
            settings: Feature materialization settings
            execution_configuratons: a dict that will be passed to the spark job when the job starts up, i.e. the
                "spark configurations". Note that not all of the configurations will be honored since some of them
                are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the
                [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list
                of spark configurations.
        """
        # produce materialization config
        for end in settings.get_backfill_cutoff_time():
            settings.backfill_time.end = end
            config = _to_materialization_config(settings)
            config_file_name = "feature_gen_conf/auto_gen_config_{}.conf".format(end.timestamp())
            config_file_path = os.path.join(self.local_workspace_dir, config_file_name)
            write_to_file(content=config, full_file_name=config_file_path)

            # make sure `FeathrClient.build_features()` is called before getting offline features/materializing
            # features in the python SDK, otherwise users will be confused about which features are available.
            # build_features() assigns the anchor_list and derived_feature_list variables, hence we check whether
            # those two variables exist to make sure the above condition is met.
            if 'anchor_list' in dir(self) and 'derived_feature_list' in dir(self):
                _FeatureRegistry.save_to_feature_config_from_context(self.anchor_list, self.derived_feature_list, self.local_workspace_dir)
            else:
                raise RuntimeError("Please call FeathrClient.build_features() first in order to materialize the features")

            udf_files = _PreprocessingPyudfManager.prepare_pyspark_udf_files(settings.feature_names, self.local_workspace_dir)
            # the CLI will directly call this so the experience won't be broken
            self._materialize_features_with_config(config_file_path, execution_configuratons, udf_files)
            if os.path.exists(config_file_path):
                os.remove(config_file_path)

        # Pretty print feature_names of materialized features
        if verbose and settings:
            FeaturePrinter.pretty_print_materialize_features(settings)
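
    # Usage sketch (editor's illustration): materializing features into the configured Redis store.
    # The sink table, backfill window, job name and feature names are example values, and the
    # package-level re-exports (MaterializationSettings, RedisSink, BackfillTime) are assumed.
    #
    #     from datetime import datetime, timedelta
    #     from feathr import MaterializationSettings, RedisSink, BackfillTime
    #
    #     backfill = BackfillTime(start=datetime(2022, 5, 20), end=datetime(2022, 5, 20),
    #                             step=timedelta(days=1))
    #     settings = MaterializationSettings(name="nycTaxiMaterializationJob",
    #                                        sinks=[RedisSink(table_name="nycTaxiDemoFeature")],
    #                                        feature_names=["f_trip_distance"],
    #                                        backfill_time=backfill)
    #     client.materialize_features(settings)
    #     client.wait_job_to_finish(timeout_sec=500)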
    def _materialize_features_with_config(self, feature_gen_conf_path: str = 'feature_gen_conf/feature_gen.conf', execution_configuratons: Dict[str, str] = None, udf_files=[]):
        """Materializes feature data based on the feature generation config. The feature data will be materialized
        to the destination specified in the feature generation config.

        Args:
            feature_gen_conf_path: Relative path to the feature generation config you want to materialize.
        """
        cloud_udf_paths = [self.feathr_spark_laucher.upload_or_get_cloud_path(udf_local_path) for udf_local_path in udf_files]

        # Read all features conf
        generation_config = FeatureGenerationJobParams(
            generation_config_path=os.path.abspath(feature_gen_conf_path),
            feature_config=os.path.join(self.local_workspace_dir, "feature_conf/"))
        '''
        - Job tags are job metadata and are not passed to the actual spark job (i.e. not visible to the spark job).
          They are more of a platform-level thing that Feathr wants to add (currently job tags only contain the job
          output URL and the job output format). They are carried over with the job and are visible to every Feathr
          client. Think of this as customized metadata for the job which would be odd to put in the spark job itself.
        - Job arguments (or sometimes called job parameters) are the command line arguments passed into the actual
          spark job. They are usually highly related to the spark job; in Feathr they are the input to the scala
          spark CLI. They are usually not spark specific (for example, specifying the location of the feature files).
        - Job configurations are "configurations" for the spark job and are usually spark specific, for example
          controlling the number of write parts for spark.
        Job configurations and job arguments (or job parameters) have quite some overlap (i.e. you can achieve the
        same goal by either using job arguments/parameters or job configurations), but job tags should only be used
        for metadata purposes.
        '''
        optional_params = []
        if _EnvVaraibleUtil.get_environment_variable('KAFKA_SASL_JAAS_CONFIG'):
            optional_params = optional_params + ['--kafka-config', self._get_kafka_config_str()]
        return self.feathr_spark_laucher.submit_feathr_job(
            job_name=self.project_name + '_feathr_feature_materialization_job',
            main_jar_path=self._FEATHR_JOB_JAR_PATH,
            python_files=cloud_udf_paths,
            main_class_name='com.linkedin.feathr.offline.job.FeatureGenJob',
            arguments=[
                '--generation-config', self.feathr_spark_laucher.upload_or_get_cloud_path(
                    generation_config.generation_config_path),
                # Local config, comma separated file names
                '--feature-config', self.feathr_spark_laucher.upload_or_get_cloud_path(
                    generation_config.feature_config),
                '--redis-config', self._getRedisConfigStr(),
                '--s3-config', self._get_s3_config_str(),
                '--adls-config', self._get_adls_config_str(),
                '--blob-config', self._get_blob_config_str(),
                '--sql-config', self._get_sql_config_str(),
                '--snowflake-config', self._get_snowflake_config_str()
            ] + optional_params,
            reference_files_path=[],
            configuration=execution_configuratons,
        )
    def wait_job_to_finish(self, timeout_sec: int = 300):
        """Waits for the job to finish in a blocking way unless it times out
        """
        if self.feathr_spark_laucher.wait_for_completion(timeout_sec):
            return
        else:
            raise RuntimeError('Spark job failed.')
    def _getRedisConfigStr(self):
        """Construct the Redis config string. The host, port, credential and other parameters can be set via
        environment variables."""
        password = _EnvVaraibleUtil.get_environment_variable(REDIS_PASSWORD)
        host = self.redis_host
        port = self.redis_port
        ssl_enabled = self.redis_ssl_enabled
        config_str = """
        REDIS_PASSWORD: "{REDIS_PASSWORD}"
        REDIS_HOST: "{REDIS_HOST}"
        REDIS_PORT: {REDIS_PORT}
        REDIS_SSL_ENABLED: {REDIS_SSL_ENABLED}
        """.format(REDIS_PASSWORD=password, REDIS_HOST=host, REDIS_PORT=port, REDIS_SSL_ENABLED=ssl_enabled)
        return config_str

    def _get_s3_config_str(self):
        """Construct the S3 config string. The endpoint, access key, secret key, and other parameters can be set via
        environment variables."""
        endpoint = self.s3_endpoint
        # if the s3 endpoint is set in feathr_config, then we also need the other environment variables;
        # keys can only be accessed through the environment
        access_key = _EnvVaraibleUtil.get_environment_variable('S3_ACCESS_KEY')
        secret_key = _EnvVaraibleUtil.get_environment_variable('S3_SECRET_KEY')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        S3_ENDPOINT: {S3_ENDPOINT}
        S3_ACCESS_KEY: "{S3_ACCESS_KEY}"
        S3_SECRET_KEY: "{S3_SECRET_KEY}"
        """.format(S3_ENDPOINT=endpoint, S3_ACCESS_KEY=access_key, S3_SECRET_KEY=secret_key)
        return config_str

    def _get_adls_config_str(self):
        """Construct the ADLS config string for abfs(s). The account, access key and other parameters can be set via
        environment variables."""
        account = _EnvVaraibleUtil.get_environment_variable('ADLS_ACCOUNT')
        # if the ADLS account is set in feathr_config, then we also need the other environment variables;
        # keys can only be accessed through the environment
        key = _EnvVaraibleUtil.get_environment_variable('ADLS_KEY')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        ADLS_ACCOUNT: {ADLS_ACCOUNT}
        ADLS_KEY: "{ADLS_KEY}"
        """.format(ADLS_ACCOUNT=account, ADLS_KEY=key)
        return config_str

    def _get_blob_config_str(self):
        """Construct the Blob config string for wasb(s). The account, access key and other parameters can be set via
        environment variables."""
        account = _EnvVaraibleUtil.get_environment_variable('BLOB_ACCOUNT')
        # if the Blob account is set in feathr_config, then we also need the other environment variables;
        # keys can only be accessed through the environment
        key = _EnvVaraibleUtil.get_environment_variable('BLOB_KEY')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        BLOB_ACCOUNT: {BLOB_ACCOUNT}
        BLOB_KEY: "{BLOB_KEY}"
        """.format(BLOB_ACCOUNT=account, BLOB_KEY=key)
        return config_str

    def _get_sql_config_str(self):
        """Construct the SQL config string for jdbc. The dbtable (query), user, password and other parameters can be
        set via environment variables."""
        table = _EnvVaraibleUtil.get_environment_variable('JDBC_TABLE')
        user = _EnvVaraibleUtil.get_environment_variable('JDBC_USER')
        password = _EnvVaraibleUtil.get_environment_variable('JDBC_PASSWORD')
        driver = _EnvVaraibleUtil.get_environment_variable('JDBC_DRIVER')
        auth_flag = _EnvVaraibleUtil.get_environment_variable('JDBC_AUTH_FLAG')
        token = _EnvVaraibleUtil.get_environment_variable('JDBC_TOKEN')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        JDBC_TABLE: {JDBC_TABLE}
        JDBC_USER: {JDBC_USER}
        JDBC_PASSWORD: {JDBC_PASSWORD}
        JDBC_DRIVER: {JDBC_DRIVER}
        JDBC_AUTH_FLAG: {JDBC_AUTH_FLAG}
        JDBC_TOKEN: {JDBC_TOKEN}
        """.format(JDBC_TABLE=table, JDBC_USER=user, JDBC_PASSWORD=password, JDBC_DRIVER=driver, JDBC_AUTH_FLAG=auth_flag, JDBC_TOKEN=token)
        return config_str

    def _get_snowflake_config_str(self):
        """Construct the Snowflake config string for jdbc. The url, user, role and other parameters can be set via
        the yaml config. The password can be set via environment variables."""
        sf_url = self.envutils.get_environment_variable_with_default('offline_store', 'snowflake', 'url')
        sf_user = self.envutils.get_environment_variable_with_default('offline_store', 'snowflake', 'user')
        sf_role = self.envutils.get_environment_variable_with_default('offline_store', 'snowflake', 'role')
        sf_password = self.envutils.get_environment_variable('JDBC_SF_PASSWORD')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        JDBC_SF_URL: {JDBC_SF_URL}
        JDBC_SF_USER: {JDBC_SF_USER}
        JDBC_SF_ROLE: {JDBC_SF_ROLE}
        JDBC_SF_PASSWORD: {JDBC_SF_PASSWORD}
        """.format(JDBC_SF_URL=sf_url, JDBC_SF_USER=sf_user, JDBC_SF_PASSWORD=sf_password, JDBC_SF_ROLE=sf_role)
        return config_str

    def _get_kafka_config_str(self):
        """Construct the Kafka config string. The endpoint, access key, secret key, and other parameters can be set
        via environment variables."""
        sasl = _EnvVaraibleUtil.get_environment_variable('KAFKA_SASL_JAAS_CONFIG')
        # HOCON format will be parsed by the Feathr job
        config_str = """
        KAFKA_SASL_JAAS_CONFIG: "{sasl}"
        """.format(sasl=sasl)
        return config_str
    def get_features_from_registry(self, project_name: str) -> Dict[str, FeatureBase]:
        """
        Get features from the registry by project name. The features retrieved from the registry are automatically
        built.
        """
        registry_anchor_list, registry_derived_feature_list = self.registry.get_features_from_registry(project_name)
        self.build_features(registry_anchor_list, registry_derived_feature_list)
        feature_dict = {}
        # add those features into a dict for easier lookup
        for anchor in registry_anchor_list:
            for feature in anchor.features:
                feature_dict[feature.name] = feature
        for feature in registry_derived_feature_list:
            feature_dict[feature.name] = feature
        return feature_dict
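
    # Usage sketch (editor's illustration): pulling previously registered features back from the
    # registry so they can be queried or materialized without redefining them locally. The project
    # name is a placeholder.
    #
    #     feature_dict = client.get_features_from_registry("feathr_getting_started")
    #     print(list(feature_dict.keys()))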