"""
Measurement and Verification Building Energy Prediction (MVBEP) is a ``class`` that encompasses different
modules, from reading and validating input data to transforming such data and using them to develop regression
models for savings estimations in the post-retrofit period.
The ``class`` is fitted by using :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` which takes in the required input data. Subsequently, an
initialization summary is produced to check the data sufficiency requirements or the need for any actions
to fix the input data. If the data meet the requirements to build a model, the function :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`
is used to transform the data, train, and evaluate regression models. :py:meth:`~mvbep.mvbep.MVBEP.generate_development_summary()` function
can be used to see the summary of the development process. Finally, savings are estimated by passing post-retrofit
data to :py:meth:`~mvbep.mvbep.MVBEP.predict_energy_consumption()` function. The current state of the documentation covers only :py:class:`~mvbep.mvbep.MVBEP` class.
Future additions to the project include writing the documentation for the remaining modules (i.e. :py:class:`~mvbep.initializer.Initializer`,
:py:class:`~mvbep.transformer.Transformer`, :py:class:`~mvbep.developer.Developer`, :py:class:`~mvbep.interpreter.Interpreter`, and :py:class:`~mvbep.writer.Writer`).
Please check the provided Notebooks for the package demonstration.
"""
from datetime import datetime
import tracemalloc
import pandas as pd
from .initializer import Initializer
from .transformer import Transformer
from .developer import Developer
import sys
from .writer import GenerateInitializationSummary, GenerateMVBEPSummary, GenerateQuantificationSummary
from .interpreter import return_interpretation_data
import sys
import os
import joblib
class MVBEP:
    """ MVBEP class to perform all steps of building an energy consumption baseline.

    The class incorporates the 4 required modules for building a baseline starting
    from initialization to savings quantification.

    All progress is kept in the ``mvbep_state`` attribute (a nested ``dict``). Its
    ``['mvbep']['development_state']`` entry takes one of the values
    ``'NOT INITIATED'``, ``'INITIATED'``, ``'FAILED INITIATION'`` or ``'DEVELOPED'``
    and is used by every public method to decide whether it may run.

    Parameters
    ----------
    mvbep_state_path : str, default 'None'
        The file path for a saved MVBEP state in case the baseline creation process
        stopped before the final step and saved by :py:meth:`~mvbep.mvbep.MVBEP.save_mvbep_state()` .

    Example
    -------
    In case an object of MVBEP was saved by using :py:meth:`~mvbep.mvbep.MVBEP.save_mvbep_state()`,
    it can be loaded like

    >>> mvbep_boulder = MVBEP(mvbep_state_path = 'mvbep_states/office-boulder_mvbep-state')

    In case there was no object saved before, an instance of MVBEP is created by

    >>> mvbep_boulder = MVBEP()
    """

    def __init__(self, mvbep_state_path:str=None):
        if mvbep_state_path is not None:
            # NOTE(security): joblib.load unpickles the file, which can execute
            # arbitrary code. Only load state files from a trusted source.
            with open(mvbep_state_path, 'rb') as f:
                self.mvbep_state = joblib.load(f)
            return
        self.mvbep_state = self._default_state()

    @staticmethod
    def _default_state():
        """Return a fresh, empty state dictionary stamped with the current time."""
        return {
            'mvbep': {
                'date': datetime.now().strftime("%Y%m%d-%H%M%S"),
                'version': 1.0,
                'best_model': None,
                'development_state': 'NOT INITIATED',
            },
            'initializer': {
                'cleaned_data': None,
                'frequency': None,
                'features': None,
                'country_code': None,
                'df_validation': None,
                'data_sufficiency': None,
                'occupancy_schedule': None,
                'df_timestamps_highlights': None,
            },
            'transformer': {
                'mvbep_frequency': None,
                'design_matrices_features': None,
            },
            'developer': {
                'training_inputs': {
                    'modeling_methods': None,
                    'test_size': None,
                    'hyperparameter_tuning': None,
                    'ranking_method': None,
                },
                'training_outputs': {
                    'training_summary': None,
                    # One independent result slot per frequency a model can be trained at.
                    'frequency': {
                        freq: {
                            'models_dict': None,
                            'summary': {
                                'evaluation': None,
                                'plot_data': None,
                            },
                        }
                        for freq in ('15-min', 'hourly', 'daily')
                    },
                },
            },
        }

    @staticmethod
    def _abort(message):
        """Print ``message`` and stop by raising ``SystemExit`` (via ``sys.exit()``)."""
        print(message)
        sys.exit()

    def fit_training(self,
                     data:pd.DataFrame,
                     frequency:str,
                     country_code:str = None,
                     occupancy_schedule:dict = None,
                     mismatch_date_threshold:float = 0.3,
                     total_missing = None,
                     max_consec_missing = None,
                     n_days = 360
                     ):
        """ Fits a MVBEP object with raw data.

        This is the first method in developing an energy consumption baseline. The
        method takes required historical data to prepare them for next processes.
        It may be called on a new object or again after a failed initiation; once
        the object is initiated or developed, the method prints a hint and exits.

        Parameters
        ----------
        data : pd.DataFrame
            A dataframe that includes the required data which includes at least

            - Timestamps in 15-min or hourly intervals
            - Energy consumption
            - Outdoor dry-bulb temperature
        frequency : str, {'15-min', 'hourly'}
            The timestamps intervals frequency.
        country_code : str, default to 'None'
            A two-letter ``str`` indicating the country code in which the building resides.
            The supported codes are listed in holiday package `documentation <https://pypi.org/project/holidays/>`_
        occupancy_schedule : dict, default to 'None'
            A ``dict`` indicating the general occupancy density in the building.
            (TODO: link to the description of the expected structure.)
        mismatch_date_threshold : float, default to 0.3
            Sets the threshold for values in ``timestamp`` column that cannot be converted from ``str``
            to ``datetime`` object.
        total_missing : int, default to 'None'
            Sets a threshold for the total number of a feature's missing observations to meet
            data sufficiency requirements. The default value is set based on frequency.
        max_consec_missing : int, default to 'None'
            Sets a threshold for consecutive missing observations in a single feature before
            the feature is dropped. The default value is set based on frequency.
        n_days : int, default to 360
            Sets a threshold for the least number of days in ``data`` .

        Raises
        ------
        SystemExit
            If the object is already initiated or developed.

        Example
        -------
        Example of a building located in Boulder, CO, USA with hourly timestamps. The instance of MVBEP
        was created with a name of ``mvbep_boulder`` .

        >>> mvbep_boulder.fit_training(
        ...     data = df_boulder_office,
        ...     frequency = 'hourly',
        ...     country_code = 'US'
        ... )
        """
        # Checking the MVBEP object state
        state = self.mvbep_state['mvbep']['development_state']
        if state == 'DEVELOPED':
            self._abort("The MVBEP started the initiation process and finished developing a MVBEP model. To predict baseline values run predict_energy_consumption()")
        # A failed initiation must be retryable: the failure messages ask the user
        # to fix the data and run fit_training() again.
        if state not in ('NOT INITIATED', 'FAILED INITIATION'):
            self._abort("The MVBEP started the initiation but didn't start developing a MVBEP model. To start development run develop_mvbep()")

        # Creating an initializer and validating the passed data
        initializer = Initializer(mismatch_date_threshold = mismatch_date_threshold)
        initializer.fit(data = data,
                        frequency = frequency,
                        country_code = country_code,
                        occupancy_schedule = occupancy_schedule
                        )
        initializer.validate(total_missing = total_missing,
                             max_consec_missing = max_consec_missing,
                             n_days = n_days
                             )

        # Updating the MVBEP object state
        if initializer.data_sufficiency:
            self.mvbep_state['mvbep']['development_state'] = 'INITIATED'
        else:
            self.mvbep_state['mvbep']['development_state'] = 'FAILED INITIATION'

        # The initializer outputs are stored whether or not the data was sufficient.
        init_state = self.mvbep_state['initializer']
        init_state['cleaned_data'] = initializer.df_fin
        init_state['frequency'] = initializer.frequency
        init_state['features'] = initializer.features
        init_state['country_code'] = initializer.country_code
        init_state['occupancy_schedule'] = initializer.occupancy_schedule
        init_state['df_validation'] = initializer.df_validation
        init_state['data_sufficiency'] = initializer.data_sufficiency
        init_state['df_timestamps_highlights'] = initializer.df_timestamps_highlights

    def generate_initialization_summary(self,
                                        file_name:str = None
                                        ):
        """ Generates summary of the initialization performed after :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` .

        The initialization summary is generated as an HTML file with highlights of the
        initialization process including plots, descriptive data, and data sufficiency result.

        Parameters
        ----------
        file_name : str, default to 'None'
            Sets the name of the HTML initialization summary. In case no name was provided,
            the resulting name will be ``initiation_time`` + ``init_sum_`` .

        Raises
        ------
        SystemExit
            If the object has not been initiated or failed the initiation process.

        Example
        -------
        Writing the initialization summary of ``mvbep_boulder`` after running :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` .

        >>> mvbep_boulder.generate_initialization_summary(file_name = 'mvbep_summaries/office-boulder_init-summary')
        """
        # The state string lives under 'development_state'; comparing the whole
        # 'mvbep' sub-dict to a string would never match.
        state = self.mvbep_state['mvbep']['development_state']
        if state == 'NOT INITIATED':
            self._abort('The MVBEP object has not been initiated. Initiate the model using fit_training().')
        if state == 'FAILED INITIATION':
            self._abort("The MVBEP object failed the initiation process. Fix the data and run fit_training()")

        init_state = self.mvbep_state['initializer']
        GenerateInitializationSummary(file_name = file_name,
                                      df_input = init_state['cleaned_data'],
                                      frequency = init_state['frequency'],
                                      features = init_state['features'],
                                      df_timestamps_highlights = init_state['df_timestamps_highlights'],
                                      df_validation = init_state['df_validation'],
                                      data_sufficiency = init_state['data_sufficiency'])

    def develop_mvbep(self,
                      modeling_methods:dict = None,
                      test_size:float = 0.2,
                      hyperparameter_tuning:bool = False,
                      ranking_method:str = 'min_cvrmse'
                      ):
        """ Transforms the cleaned data and develops regression models.

        Takes the cleaned data after :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` and iterates over the possible transformations
        while using each transformation to generate regression models using the chosen modeling approaches
        in ``modeling_methods`` . With each transformation, outputs such as evaluation metrics and models are
        saved in the MVBEP object's state (i.e. attribute ``mvbep_state`` ).

        Parameters
        ----------
        modeling_methods : dict, default to 'None'
            The chosen modeling approaches to develop the baseline. In case ``None`` was passed, the argument is
            passed by:

            >>> default_modeling_methods = {
            ...     'LR' : True, # TOWT (If the frequency is hourly otherwise it is WLS)
            ...     'RF' : True, # Random Regression Forest
            ...     'XGB': True, # Extreme Gradient Boosting
            ...     'SVR': True, # Support Vector Regressor
            ...     'SLP': True, # Feed Forward Neural Network
            ...     'KNN': True  # K-Nearest Neighbor
            ... }
        test_size : float, default to 0.2
            Sets the testing set size out of the input data.
        hyperparameter_tuning : bool, default to ``False``
            - If ``True`` : the hyperparameter tuning process is performed for any model with hyperparameters to be tuned.
            - If ``False`` : No hyperparameter tuning process is performed (except for KNN).
        ranking_method : str, {'min_cvrmse', 'min_nmbe'}, default to 'min_cvrmse'
            Sets the ranking method to choose the best model based on the testing set evaluation.

            - If ``min_cvrmse`` : The best model is selected based on Coefficient of Variation of Root Mean Squared Error (CV(RMSE))
            - If ``min_nmbe`` : The best model is selected based on Normalized Mean Bias Error (NMBE).

            Any value other than ``'min_cvrmse'`` is ranked by NMBE.

        Raises
        ------
        SystemExit
            If the object is not initiated, failed the initiation process, or is already developed.

        Example
        -------
        Developing ``mvbep_boulder`` after running :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` .

        >>> mvbep_boulder.develop_mvbep()
        """
        # Checking initialization result
        state = self.mvbep_state['mvbep']['development_state']
        if state == 'NOT INITIATED':
            self._abort("The MVBEP object didn't start the initiation process. Run fit_training()")
        elif state == 'FAILED INITIATION':
            self._abort("The MVBEP object failed the initiation process. Fix the data and run fit_training()")
        elif state == 'DEVELOPED':
            self._abort("The MVBEP started the initiation process and finished developing a MVBEP model. To predict baseline values run predict_energy_consumption()")

        # Updating the MVBEP object state
        training_inputs = self.mvbep_state['developer']['training_inputs']
        training_inputs['test_size'] = test_size
        training_inputs['hyperparameter_tuning'] = hyperparameter_tuning
        training_inputs['ranking_method'] = ranking_method

        # Determine possible downsampling: a model is trained at the input
        # frequency (downsample None) and at every coarser frequency.
        init_state = self.mvbep_state['initializer']
        if init_state['frequency'] == '15-min':
            downsamplings = [None, '15-min~hourly', '15-min~daily']
            training_frequency = ['15-min', 'hourly', 'daily']
        elif init_state['frequency'] == 'hourly':
            downsamplings = [None, 'hourly~daily']
            training_frequency = ['hourly', 'daily']
        else:
            # Any other input frequency is trained as daily without downsampling.
            downsamplings = [None]
            training_frequency = ['daily']

        freq_features_dict = {'15-min':None , 'hourly':None, 'daily':None, 'towt':None}
        frequency_outputs = self.mvbep_state['developer']['training_outputs']['frequency']

        # MVBEP model development with different frequencies
        for downsample, freq in zip(downsamplings, training_frequency):
            # Transformation
            transformer = Transformer()
            transformer.fit(data = init_state['cleaned_data'],
                            timestamp_frequency = init_state['frequency'],
                            optional_features = init_state['features'],
                            occupancy_schedule = init_state['occupancy_schedule'],
                            country_code = init_state['country_code'],
                            downsample_from_to = downsample)
            transformer.transform()
            freq_features_dict[freq] = transformer.design_matrix_features
            if freq == 'hourly':
                freq_features_dict['towt'] = transformer.towt_design_matrix_features

            # Training, hyperparameter tuning, and testing
            developer = Developer(modeling_methods = modeling_methods,
                                  test_size = test_size,
                                  hyperparameter_tuning = hyperparameter_tuning,
                                  ranking_method = ranking_method)
            developer.fit(data = transformer.df_fin,
                          timestamp_frequency = freq,
                          towt_design_matrix = transformer.towt_design_matrix,
                          design_matrix_features = transformer.design_matrix_features,
                          towt_design_matrix_features = transformer.towt_design_matrix_features
                          )

            # Saving MVBEP results. The methods are stored under the declared
            # 'modeling_methods' key; the historical 'self.modeling_methods' key
            # is kept as well in case other modules or saved states rely on it.
            training_inputs['modeling_methods'] = developer.modeling_methods
            training_inputs['self.modeling_methods'] = developer.modeling_methods
            outputs = frequency_outputs[freq]
            outputs['models_dict'] = developer.models_dict
            outputs['summary']['evaluation'] = developer.show_evaluation_metrics()
            outputs['summary']['plot_data'] = developer.return_plot_data()

        # Choosing the best model and best frequency
        # Summarizing outputs of each downsample iterations
        dfs_eval = []
        for freq, freq_dict in frequency_outputs.items():
            if freq_dict['models_dict'] is None:
                continue  # no model was trained at this frequency
            df_eval = freq_dict['summary']['evaluation'].loc[:, ['train_cvrmse', 'train_nmbe', 'test_cvrmse', 'test_nmbe']].reset_index()
            df_eval['frequency'] = freq
            dfs_eval.append(df_eval)
        training_summary = pd.concat(dfs_eval).reset_index(drop=True)

        # Saving development state
        self.mvbep_state['transformer']['design_matrices_features'] = freq_features_dict
        self.mvbep_state['developer']['training_outputs']['training_summary'] = training_summary

        # Rank once by absolute metric value (NMBE may be negative); the first
        # row gives both the best model and the frequency it was trained at.
        condition_col = 'test_cvrmse' if ranking_method == 'min_cvrmse' else 'test_nmbe'
        ranked = training_summary.sort_values(by=condition_col, key=abs).reset_index(drop=True)
        self.mvbep_state['mvbep']['best_model'] = ranked['models'][0]
        self.mvbep_state['transformer']['mvbep_frequency'] = ranked['frequency'][0]
        self.mvbep_state['mvbep']['development_state'] = 'DEVELOPED'

    def generate_development_summary(self,
                                     file_name:str = None
                                     ):
        """ Generates development summary after using :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`.

        Outputs an HTML file that summarizes the development process after running
        :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`.

        Parameters
        ----------
        file_name : str, default to 'None'
            Sets the name of the HTML development summary. In case no name was provided,
            the resulting name will be ``initiation_time`` + ``dev_sum_``.

        Raises
        ------
        SystemExit
            If the object has not been developed.

        Example
        -------
        Writing the development summary of ``mvbep_boulder`` after running :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`.

        >>> mvbep_boulder.generate_development_summary(file_name = 'mvbep_summaries/office-boulder_dev-summary')
        """
        if self.mvbep_state['mvbep']['development_state'] != 'DEVELOPED':
            self._abort('The MVBEP object has not been developed. Develop the model using develop_mvbep.')
        GenerateMVBEPSummary(file_name = file_name,
                             mvbep_state = self.mvbep_state)

    def save_mvbep_state(self, file_name:str=None):
        """ Saves the current progress of the MVBEP object by storing ``mvbep_state``.

        Parameters
        ----------
        file_name : str, default to 'None'
            Sets the name of the ``Joblib`` state file. In case no name was provided,
            the resulting name will be the time of saving (``%Y%m%d-%H%M%S``) + ``_mvbep_state``.

        Example
        -------
        Saving the state of either an initiated MVBEP by :py:meth:`~mvbep.mvbep.MVBEP.fit_training()` or a developed one by :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`.

        >>> mvbep_boulder.save_mvbep_state('mvbep_states/office-boulder_mvbep-state')
        """
        if file_name is None:
            file_name = datetime.now().strftime("%Y%m%d-%H%M%S") + '_mvbep_state'
        with open(file_name, 'wb') as f:
            joblib.dump(self.mvbep_state, f, compress=5)

    def predict_energy_consumption(self,
                                   data:pd.DataFrame,
                                   generate_summary:bool = False,
                                   file_name:str = None,
                                   mismatch_date_threshold = 0.3,
                                   total_missing = None,
                                   max_consec_missing = None
                                   ):
        """ Predicts the baseline energy consumption for post-retrofit data after using :py:meth:`~mvbep.mvbep.MVBEP.develop_mvbep()`.

        Depending on ``generate_summary``, the method either returns the baseline predictions
        or outputs an HTML file that summarizes the quantification process. The quantification process requires post-retrofit
        data that matches the same frequency and features of the data used in initialization when running
        :py:meth:`~mvbep.mvbep.MVBEP.fit_training()`. Features that were dropped in the initialization
        process are not required in the post-retrofit data. To see which features passed the
        initialization process, check the output of :py:meth:`~mvbep.mvbep.MVBEP.generate_initialization_summary()`.

        Parameters
        ----------
        data : pd.DataFrame
            The post-retrofit data.
        generate_summary : bool, default to False
            Either generates a summary in an HTML file or returns the baseline energy consumption.
            In case the passed ``data`` does not meet the requirements, an initialization summary is generated
            regardless of the passed argument in ``generate_summary``.

            - If ``True``: A quantification summary is provided. The function does not return any object.
            - If ``False``: The baseline energy consumption for the provided post-retrofit period is
              returned.
        file_name : str, default to 'None'
            Sets the name of the HTML quantification summary. In case no name was provided,
            the resulting name will be ``initiation_time`` + ``quant_sum_``.
        mismatch_date_threshold : float, default to 0.3
            Sets the threshold for values in ``timestamp`` column that cannot be converted from ``str``
            to ``pd.datetime`` object.
        total_missing : int, default to 'None'
            Sets a threshold for the total number of a feature's missing observations to meet
            data sufficiency requirements. The value is set based on frequency.
        max_consec_missing : int, default to 'None'
            Sets a threshold for consecutive missing observations in a single feature before
            the feature is dropped. The value is set based on frequency.

        Returns
        -------
        object or None
            When ``generate_summary`` is ``False``, the value returned by the ``predict`` method of
            the best model's pipeline (presumably array-like, one value per timestamp).
            ``None`` when ``generate_summary`` is ``True``.

        Raises
        ------
        SystemExit
            If the object has not been developed, or if ``data`` fails the initialization
            checks (after the summary has been written).

        Example
        -------
        Writing the quantification summary of ``mvbep_boulder``.

        >>> mvbep_boulder.predict_energy_consumption(data = df_boulder_post_retrofit,
        ...                                          generate_summary = True,
        ...                                          file_name='mvbep_summaries/office-boulder_dev-summary')
        """
        # Check MVBEP state
        if self.mvbep_state['mvbep']['development_state'] != 'DEVELOPED':
            self._abort('The MVBEP object has not been developed. Develop the model using develop_mvbep.')

        # Initialization
        initializer = Initializer(mvbep_state = self.mvbep_state,
                                  mismatch_date_threshold = mismatch_date_threshold
                                  )
        initializer.validate_pred_data(data = data,
                                       total_missing = total_missing,
                                       max_consec_missing = max_consec_missing
                                       )
        if initializer.initializer_state != 'INITIATED':
            # The summary is written even when generate_summary is False so the
            # user can see why the post-retrofit data was rejected.
            print('The post-retrofit data failed the initialization process. Check the initialization summary.')
            GenerateQuantificationSummary(mvbep_state = self.mvbep_state,
                                          df_init = initializer.df_fin,
                                          df_savings = None,
                                          df_timestamps_highlights = initializer.df_timestamps_highlights,
                                          df_validation = initializer.df_validation,
                                          data_sufficiency = initializer.data_sufficiency,
                                          file_name = file_name)
            sys.exit()

        # Transformation: downsample the new data to the frequency of the best model.
        transformer = Transformer()
        data_frequency = initializer.frequency
        best_frequency = self.mvbep_state['transformer']['mvbep_frequency']
        downsample = None if best_frequency == data_frequency else data_frequency+'~'+best_frequency
        transformer.fit(data = initializer.df_fin,
                        timestamp_frequency = initializer.frequency,
                        optional_features = initializer.features,
                        occupancy_schedule = initializer.occupancy_schedule,
                        country_code = initializer.country_code,
                        downsample_from_to = downsample)
        transformer.transform()

        # Predictions
        best_model = self.mvbep_state['mvbep']['best_model']
        pred_pipeline = self.mvbep_state['developer']['training_outputs']['frequency'][best_frequency]['models_dict'][best_model]['model']['pipe']
        features_by_frequency = self.mvbep_state['transformer']['design_matrices_features']
        if best_model == 'LR_towt':
            # The TOWT model uses its own design matrix and feature list.
            prediction_features = features_by_frequency['towt']
            design_matrix = transformer.towt_design_matrix
        else:
            prediction_features = features_by_frequency[best_frequency]
            design_matrix = transformer.df_fin
        baseline_pred = pred_pipeline.predict(design_matrix.loc[:, prediction_features])

        # Savings
        df_savings = transformer.df_fin.copy()
        df_savings.rename(columns={'energy':'acut_post_energy'}, inplace=True)
        df_savings['base_post_energy'] = baseline_pred

        if not generate_summary:
            return baseline_pred

        # Interpretation (skipped for models whose name starts with 'LR')
        if not best_model.startswith('LR'):
            _ , local_shap_values = return_interpretation_data(mvbep_state=self.mvbep_state,
                                                               global_sample_size=1,
                                                               local_sample_size=df_savings.shape[0],
                                                               df_input = df_savings,
                                                               design_matrix_features= prediction_features)
            df_savings = local_shap_values

        # Summary
        GenerateQuantificationSummary(mvbep_state = self.mvbep_state,
                                      df_init = initializer.df_fin,
                                      df_savings = df_savings,
                                      df_timestamps_highlights = initializer.df_timestamps_highlights,
                                      df_validation = initializer.df_validation,
                                      data_sufficiency = initializer.data_sufficiency,
                                      file_name = file_name)