Source code for mvbep.mvbep

"""
Measurement and Verification Building Energy Prediction (MVBEP) is a ``class`` that encompasses different
modules, from reading and validating input data to transforming such data and using them to develop regression
models for savings estimation in the post-retrofit period.

The ``class`` is fitted by using :py:meth:`~mvbep.mvbep.MVBEP.fit_training()`, which takes in the required input data. Next, an
initialization summary is produced to check the data sufficiency requirements or the need for any actions
to fix the input data. If the data meet the requirements to build a model, the function :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`
is used to transform the data, train, and evaluate regression models. The :py:meth:`~mvbep.mvbep.MVBEP.generate_development_summary()` function
can be used to see the summary of the development process. Finally, savings are estimated by passing post-retrofit
data to the :py:meth:`~mvbep.mvbep.MVBEP.predict_energy_consumption()` function. The current state of the documentation covers only the :py:class:`~mvbep.mvbep.MVBEP` class.
Future additions to the project include writing the documentation for the remaining modules (i.e. :py:class:`~mvbep.initializer.Initializer`,
:py:class:`~mvbep.transformer.Transformer`, :py:class:`~mvbep.developer.Developer`, :py:class:`~mvbep.interpreter.Interpreter`, and :py:class:`~mvbep.writer.Writer`).

Please check the provided Notebooks for the package demonstration.

"""


from datetime import datetime
import tracemalloc
import pandas as pd

from .initializer import Initializer
from .transformer import Transformer
from .developer import Developer

import sys

from .writer import GenerateInitializationSummary, GenerateMVBEPSummary, GenerateQuantificationSummary
from .interpreter import return_interpretation_data

import sys
import os
import joblib


class MVBEP:
    """
    MVBEP class to perform all steps of building an energy consumption baseline.

    The class incorporates the 4 required modules for building a baseline,
    starting from initialization to savings quantification. All progress is
    kept in the ``mvbep_state`` attribute, a nested ``dict`` with the sections
    ``'mvbep'``, ``'initializer'``, ``'transformer'`` and ``'developer'``.

    Parameters
    ----------
    mvbep_state_path : str, default 'None'
        The file path for a saved MVBEP state in case the baseline creation
        process stopped before the final step and was saved by
        :py:meth:`~mvbep.mvbep.MVBEP.save_mvbep_state()`. The file is read
        with ``joblib.load``; only load files from a trusted source.

    Example
    ----------
    In case an object of MVBEP was saved by using
    :py:meth:`~mvbep.mvbep.MVBEP.save_mvbep_state()`, it can be loaded like

    >>> mvbep_boulder = MVBEP(mvbep_state_path = 'mvbep_states/office-boulder_mvbep-state')

    In case there was no object saved before, an instance of MVBEP is created by

    >>> mvbep_boulder = MVBEP()
    """

    def __init__(self, mvbep_state_path:str=None):
        # Fresh state: every slot is empty and the object is marked as
        # 'NOT INITIATED' until fit_training() is run.
        self.mvbep_state = {
            'mvbep':{
                'date'             : datetime.now().strftime("%Y%m%d-%H%M%S"),
                'version'          : 1.0,
                'best_model'       : None,
                'development_state': 'NOT INITIATED'
            },
            'initializer':{
                'cleaned_data'             : None,
                'frequency'                : None,
                'features'                 : None,
                'country_code'             : None,
                'df_validation'            : None,
                'data_sufficiency'         : None,
                'occupancy_schedule'       : None,
                'df_timestamps_highlights' : None
            },
            'transformer':{
                'mvbep_frequency'          : None,
                'design_matrices_features' : None
            },
            'developer':{
                'training_inputs':{
                    'modeling_methods'      : None,
                    'test_size'             : None,
                    'hyperparameter_tuning' : None,
                    'ranking_method'        : None
                },
                'training_outputs':{
                    'training_summary': None,
                    # One independent result slot per training frequency;
                    # the key order ('15-min', 'hourly', 'daily') is kept.
                    'frequency':{
                        freq: {
                            'models_dict' : None,
                            'summary':{
                                'evaluation': None,
                                'plot_data' : None
                            }
                        }
                        for freq in ('15-min', 'hourly', 'daily')
                    }
                }
            }
        }

        if mvbep_state_path is not None:
            # NOTE(security): joblib.load is pickle-based and can execute
            # arbitrary code while loading. Only open state files that were
            # produced by save_mvbep_state() and come from a trusted source.
            with open(mvbep_state_path, 'rb') as f:
                self.mvbep_state = joblib.load(f)
[docs] def fit_training(self, data:pd.DataFrame, frequency:str, country_code:str = None, occupancy_schedule:dict = None, mismatch_date_threshold:float = 0.3, total_missing = None, max_consec_missing = None, n_days = 360 ): """ Fits a MVBEP object with raw data. This is the first method in developing an energy consumption baseline. The method takes required historical data to prepare them for next processes. Parameters ---------- data : pd.DataFrame A dataframe that includes the required data which includes at least - Timestamps in 15-min or hourly intervals - Energy consumption - Outdoor dry-bulb temperature frequency : str, {'15-min', 'hourly'} The timestamps intervals frequency. country_code : str, default to 'None' A two-letter ``str`` indicating the country code in which the building resides. The supported codes are listed in holiday package `documentation <https://pypi.org/project/holidays/>`_ occupancy_schedule : dict, default to 'None' A ``dict`` indicating the general occupancy density in the building. [Check the parameter structure ](??) mismatch_date_threshold : float, default to 0.3 Sets the threshold for values in ``timestamp`` column that cannot be converted from ``str`` to ``datetime`` object. total_missing : int, default to 'None' Sets a threshold for the total number of a feature's missing observations to meet data sufficiency requirements. The default value is set based on frequency. max_consec_missing : int, default to 'None' Sets a threshold for consecutive missing observations in a single feature before the feature is dropped. The default value is set based on frequency. n_days : int, default to 365 Sets a threshold for the least number of days in ``data`` . Example ---------- Example of a building located in Boulder, CO, USA with hourly timestamps. The instance of MVBEP was created with a nmae of ``mvbep_boulder`` . >>> mvbep_boulder.fit_training( ... data = df_boulder_office, ... frequency = 'hourly', ... country_code = 'US' ... 
) """ # Checking the MVBEP object state if self.mvbep_state['mvbep']['development_state'] != 'NOT INITIATED': if self.mvbep_state['mvbep']['development_state'] == 'DEVELOPED': print("The MVBEP started the initiation process and finished developing a MVBEP model. To predict baseline values run predict_energy_consumption()") sys.exit() else: print("The MVBEP started the initiation but didn't start developing a MVBEP model. To start development run develop_mvbep()") sys.exit() # Creating an initializer and validating the passed data initializer = Initializer(mismatch_date_threshold = mismatch_date_threshold) initializer.fit(data = data, frequency = frequency, country_code = country_code, occupancy_schedule = occupancy_schedule ) initializer.validate(total_missing = total_missing, max_consec_missing = max_consec_missing, n_days = n_days ) # Updating the MVBEP object state if initializer.data_sufficiency: self.mvbep_state['mvbep']['development_state'] = 'INITIATED' else: self.mvbep_state['mvbep']['development_state'] = 'FAILED INITIATION' self.mvbep_state['initializer']['cleaned_data'] = initializer.df_fin self.mvbep_state['initializer']['frequency'] = initializer.frequency self.mvbep_state['initializer']['features'] = initializer.features self.mvbep_state['initializer']['country_code'] = initializer.country_code self.mvbep_state['initializer']['occupancy_schedule'] = initializer.occupancy_schedule self.mvbep_state['initializer']['df_validation'] = initializer.df_validation self.mvbep_state['initializer']['data_sufficiency'] = initializer.data_sufficiency self.mvbep_state['initializer']['df_timestamps_highlights'] = initializer.df_timestamps_highlights
[docs] def generate_initialization_summary(self, file_name:str = None ): """ Generates summary of the initialization performed after :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` . The initialization summary is generated as an HTML file with highlights of the initialization process including plots, descriptive data, and data sufficiency result. Parameters ---------- file_name : str, default to 'None' Sets the name of the HTML initialization summary. In case no name was provided, the resulting name will be ``initiation_time`` + ``init_sum_`` . Example ---------- Writing the initialization summary of ``mvbep_boulder`` after running :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` . >>> mvbep_boulder.generate_initialization_summary(file_name = 'mvbep_summaries/office-boulder_init-summary') """ if self.mvbep_state['mvbep'] == 'NOT INITIATED': print('The MVBEP object has not been initiated. Initiate the model using fit_training().') sys.exit() elif self.mvbep_state['mvbep']['development_state'] == 'FAILED INITIATION': print("The MVBEP object failed the initiation process. Fix the data and run fit_training()") sys.exit() else: GenerateInitializationSummary(file_name = file_name, df_input = self.mvbep_state['initializer']['cleaned_data'], frequency = self.mvbep_state['initializer']['frequency'], features = self.mvbep_state['initializer']['features'], df_timestamps_highlights = self.mvbep_state['initializer']['df_timestamps_highlights'], df_validation = self.mvbep_state['initializer']['df_validation'], data_sufficiency = self.mvbep_state['initializer']['data_sufficiency'])
[docs] def develop_mvbep(self, modeling_methods:dict = None, test_size:float = 0.2, hyperparameter_tuning:bool = False, ranking_method:str = 'min_cvrmse' ): """ Transforms the cleaned data and develops regression models. Takes the cleaned data after :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` and iterates over the possible transformations while using each transformation to generate regression models using the chosen modeling approaches in ``modeling_methods`` . With each transformation, outputs such as evaluation metrics and models are saved in the MVBEP object's state (i.e. attribute ``mvbep_state`` ). Parameters ---------- modeling_methods : dict, default to 'None' The chosen modeling approaches to develop the baseline. In case None was passed, the argument is passed by: >>> default_modeling_methods = { ... 'LR' : True, # TOWT (If the frequency is hourly otherwise it is WLS) ... 'RF' : True, # Random Regression Forest ... 'XGB': True, # Extreme Gradient Boosting ... 'SVR': True, # Support Vector Regressor ... 'SLP': True, # Feed Forward Neural Network ... 'KNN': True # K-Nearest Neighbor ... } test_size : float, default to 0.2 Sets the testing set size out of the input data. hyperparameter_tuning : bool, defalut to ``False`` - If ``True`` : the hyperparameter tuning process is performed for any model with hyperparameters to be tuned. - If ``False`` : No hyperparameter tuning process is performed (except for KNN). ranking_method : str, {'min_cvrmse', 'min_nmbe'}, default to 'min_cvrmse' Sets the ranking method to choose the best model based on the testing set evaluation. - If ``min_cvrmse`` : The best model is selected based on Coefficient of Variation of Root Mean Squared Error (CV(RMSE)) - If 'min_nmbe': The best model is selected based on Normalized Mean Bias Error (NMBE). Example ---------- Developing ``mvbep_boulder`` after running :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` . 
>>> mvbep_boulder.develop_mvbep() """ # Checking initialization result if self.mvbep_state['mvbep']['development_state'] == 'NOT INITIATED': print("The MVBEP object didn't start the initiation process. Run fit_training()") sys.exit() elif self.mvbep_state['mvbep']['development_state'] == 'FAILED INITIATION': print("The MVBEP object failed the initiation process. Fix the data and run fit_training()") sys.exit() elif self.mvbep_state['mvbep']['development_state'] == 'DEVELOPED': print("The MVBEP started the initiation process and finished developing a MVBEP model. To predict baseline values run predict_energy_consumption()") sys.exit() # Updating the MVBEP object state self.mvbep_state['developer']['training_inputs']['test_size'] = test_size self.mvbep_state['developer']['training_inputs']['hyperparameter_tuning'] = hyperparameter_tuning self.mvbep_state['developer']['training_inputs']['ranking_method'] = ranking_method # Determine possible downsampling downsamplings = [] training_frequency = [] freq_features_dict = {'15-min':None , 'hourly':None, 'daily':None, 'towt':None} if self.mvbep_state['initializer']['frequency'] == '15-min': downsamplings = [None, '15-min~hourly', '15-min~daily'] training_frequency = ['15-min','hourly', 'daily'] elif self.mvbep_state['initializer']['frequency'] == 'hourly': downsamplings = [None, 'hourly~daily'] training_frequency = ['hourly', 'daily'] else: downsamplings = [None] training_frequency = ['daily'] # MVBEP model development with different frequencies for downsample, freq in zip(downsamplings, training_frequency): #Transformation transformer = Transformer() transformer.fit(data = self.mvbep_state['initializer']['cleaned_data'], timestamp_frequency = self.mvbep_state['initializer']['frequency'], optional_features = self.mvbep_state['initializer']['features'], occupancy_schedule = self.mvbep_state['initializer']['occupancy_schedule'], country_code = self.mvbep_state['initializer']['country_code'], downsample_from_to = downsample) 
transformer.transform() freq_features_dict[freq] = transformer.design_matrix_features if freq == 'hourly': freq_features_dict['towt'] = transformer.towt_design_matrix_features #Training, hyperparameter tuning, and testing developer = Developer(modeling_methods = modeling_methods, test_size = test_size, hyperparameter_tuning = hyperparameter_tuning, ranking_method = ranking_method) developer.fit(data = transformer.df_fin, timestamp_frequency = freq, towt_design_matrix = transformer.towt_design_matrix, design_matrix_features = transformer.design_matrix_features, towt_design_matrix_features = transformer.towt_design_matrix_features ) #Saving MVBEP results self.mvbep_state['developer']['training_inputs']['self.modeling_methods'] = developer.modeling_methods self.mvbep_state['developer']['training_outputs']['frequency'][freq]['models_dict'] = developer.models_dict self.mvbep_state['developer']['training_outputs']['frequency'][freq]['summary']['evaluation'] = developer.show_evaluation_metrics() self.mvbep_state['developer']['training_outputs']['frequency'][freq]['summary']['plot_data'] = developer.return_plot_data() # Choosing the best model and best frequency # Summarizing outputs of each downsample iterations dfs_eval = [] for freq, freq_dict in self.mvbep_state['developer']['training_outputs']['frequency'].items(): if freq_dict['models_dict'] is not None: df_eval = freq_dict['summary']['evaluation'].loc[:, ['train_cvrmse', 'train_nmbe', 'test_cvrmse', 'test_nmbe']].reset_index() df_eval['frequency'] = freq dfs_eval.append(df_eval) training_summary = pd.concat(dfs_eval).reset_index(drop=True) #Saving development state self.mvbep_state['transformer']['design_matrices_features'] = freq_features_dict self.mvbep_state['developer']['training_outputs']['training_summary'] = training_summary condition_col = 'test_cvrmse' if ranking_method == 'min_cvrmse' else 'test_nmbe' self.mvbep_state['mvbep']['best_model'] = training_summary.sort_values(by=condition_col, 
key=abs).reset_index(drop=True)['models'][0] self.mvbep_state['transformer']['mvbep_frequency'] = training_summary.sort_values(by=condition_col, key=abs).reset_index(drop=True)['frequency'][0] self.mvbep_state['mvbep']['development_state'] = 'DEVELOPED'
[docs] def generate_development_summary(self, file_name:str = None ): """ Generates development summary after using :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. Outputs an HTML file that summarizes the development process after running :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. Parameters ---------- file_name : str, default to 'None' Sets the name of the HTML development summary. In case no name was provided, the resulting name will be ``initiation_time`` + ``dev_sum_``. Example ------- Writing the initialization summary of `mvbep_boulder` after running :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. >>> mvbep_boulder.generate_development_summary(file_name = 'mvbep_summaries/office-boulder_dev-summary') """ if self.mvbep_state['mvbep']['development_state'] != 'DEVELOPED': print('The MVBEP object has not been developed. Develop the model using develop_mvbep.') sys.exit() else: GenerateMVBEPSummary(file_name = file_name, mvbep_state = self.mvbep_state)
[docs] def save_mvbep_state(self, file_name:str=None): """ Saves the current progress of the MVBEP object by storing ``mvbep_state``. Parameters ---------- file_name : str, default to 'None' Sets the name of the ``Joblib`` state file. In case no name was provided, the resulting name will be ``initiation_time`` + ``mvbep_state``. Example ---------- Saving the state of either an initiated MVBEP by :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` or a developed one by :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. >>> mvbep.save_state('mvbep_states/office-boulder_mvbep-state') """ file_name = datetime.now().strftime("%Y%m%d-%H%M%S")+'_mvbep_state' if file_name is None else file_name with open(file_name, 'wb') as f: joblib.dump(self.mvbep_state, f, compress=5)
[docs] def predict_energy_consumption(self, data:pd.DataFrame, generate_summary:bool = False, file_name:str = None, mismatch_date_threshold = 0.3, total_missing = None, max_consec_missing = None ): """ Generates savings quantification summary after using :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. Outputs an HTML file that summarizes the quantification process after running :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`. The quantification process requires post-retrofit data that matches the same frequency and features of the data used in initialization when running :py:meth:`~mvbep.mvbep.MVBEP.fit_training()`. Features that was dropped in the initialization process are not required in the post-retrofit data. To see which features passed the initialization process, check the output of :py:meth:`~mvbep.mvbep.MVBEP.generate_initialization_summary()`. Parameters ---------- data : pd.DataFrame The post-retrofit data. generate_summary :bool, default to False Either generates a summary in an HTML file or return a ``list`` of baseline energy consumption. In case the passed ``data`` does not meet the requirements, an initialization summary is generated regardless of the passed argument in ``generate_summary``. - If ``True``: A quantification summary is provided. The function does not return any object. - If ``False``: A list of baseline energy consumption for the provided post-retrofit period is returend. file_name : str, default to 'None' Sets the name of the HTML quantification summary. In case no name was provided, the resulting name will be ``initiation_time`` + ``quant_sum_``. mismatch_date_threshold : float, default to 0.3 Sets the threshold for values in `timestamp` column that cannot be converted from ``str`` to ``pd.datetime`` object. total_missing : int, default to 'None' Sets a threshold for the total number of a feature's missing observations to meet data sufficiency requirements. The value is set based on frequency. 
max_consec_missing : int, default to 'None' Sets a threshold for consecutive missing observations in a single feature before the feature is dropped. The value is set based on frequency. Example ---------- Writing the quantification summary of ``mvbep_boulder``. >>> mvbep_boulder.predict_energy_consumption(data = df_boulder_post_retrofit, ... generate_summary = True, ... file_name='mvbep_summaries/office-boulder_dev-summary') """ # Check MVBEP state if self.mvbep_state['mvbep']['development_state'] != 'DEVELOPED': print('The MVBEP object has not been developed. Develop the model using develop_mvbep.') sys.exit() # Initialization mismatch_date_threshold initializer = Initializer(mvbep_state = self.mvbep_state, mismatch_date_threshold = mismatch_date_threshold ) initializer.validate_pred_data(data = data, total_missing = total_missing, max_consec_missing = max_consec_missing ) # Transformation if initializer.initializer_state != 'INITIATED': print('The post-retrofit data failed the initialization process. 
Check the initialization summary.') GenerateQuantificationSummary(mvbep_state = self.mvbep_state, df_init = initializer.df_fin, df_savings = None, df_timestamps_highlights = initializer.df_timestamps_highlights, df_validation = initializer.df_validation, data_sufficiency = initializer.data_sufficiency, file_name = file_name) sys.exit() transformer = Transformer() data_frequency = initializer.frequency best_frequency = self.mvbep_state['transformer']['mvbep_frequency'] downsample = None if best_frequency == data_frequency else data_frequency+'~'+best_frequency transformer.fit(data = initializer.df_fin, timestamp_frequency = initializer.frequency, optional_features = initializer.features, occupancy_schedule = initializer.occupancy_schedule, country_code = initializer.country_code, downsample_from_to = downsample) transformer.transform() # Predictions best_model = self.mvbep_state['mvbep']['best_model'] pred_pipeline = self.mvbep_state['developer']['training_outputs']['frequency'][best_frequency]['models_dict'][best_model]['model']['pipe'] if best_model == 'LR_towt': prediction_features = self.mvbep_state['transformer']['design_matrices_features']['towt'] baseline_pred = pred_pipeline.predict(transformer.towt_design_matrix.loc[:, prediction_features]) else: prediction_features = self.mvbep_state['transformer']['design_matrices_features'][best_frequency] baseline_pred = pred_pipeline.predict(transformer.df_fin.loc[:, self.mvbep_state['transformer']['design_matrices_features'][best_frequency]]) # Savings df_savings = transformer.df_fin.copy() df_savings.rename(columns={'energy':'acut_post_energy'}, inplace=True) df_savings['base_post_energy'] = baseline_pred # Interpretation if generate_summary: if not best_model.startswith('LR'): _ , local_shap_values = return_interpretation_data(mvbep_state=self.mvbep_state, global_sample_size=1, local_sample_size=df_savings.shape[0], df_input = df_savings, design_matrix_features= prediction_features) df_savings = local_shap_values # 
Summary if generate_summary: GenerateQuantificationSummary(mvbep_state = self.mvbep_state, df_init = initializer.df_fin, df_savings = df_savings, df_timestamps_highlights = initializer.df_timestamps_highlights, df_validation = initializer.df_validation, data_sufficiency = initializer.data_sufficiency, file_name = file_name) else: return baseline_pred