Commit 925ca1fc authored by dasharatha.vamshi

updated

parent e5389819
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
reports/*.pdf
reports/*.csv
reports/*.xlsx
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
.idea
logs
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
### VisualStudio template
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
### JupyterNotebooks template
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.env
@@ -4,5 +4,9 @@ from mlflow_util import ModelReTrainer
 if __name__ == "__main__":
     df = pd.read_csv('mlflow-test.csv')
-    obj = ModelReTrainer(df, 'instantaneous_export', 'Dalmia Solar Forecasting V2', 'Forecasting_kadapa_v1', 'versioning')
+    feature = 'instantaneous_export'
+    exp_name = 'Dalmia Solar Forecasting V2'
+    parent_run_name = 'Forecasting_kadapa_v1'
+    model_name = 'versioning'
+    obj = ModelReTrainer(df, feature, exp_name, parent_run_name, model_name)
     obj.get_latest_model()
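The change above lifts the four positional string literals into named locals. A further, purely hypothetical step would be keyword arguments, which keep the call site self-documenting without the extra variables; the parameter names below are assumptions inferred from the attributes the class uses (`output_feature`, `experiment_name`, `parent_run_name`, `model_save_name`), not its confirmed signature:

```python
# Hypothetical sketch only: parameter names are guesses from the class attributes,
# so check ModelReTrainer.__init__ before using this form.
obj = ModelReTrainer(
    df,
    output_feature='instantaneous_export',
    experiment_name='Dalmia Solar Forecasting V2',
    parent_run_name='Forecasting_kadapa_v1',
    model_save_name='versioning',
)
obj.get_latest_model()
```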
This source diff could not be displayed because it is too large. You can view the blob instead.
@@ -107,6 +107,10 @@ class ModelReTrainer:
         self._gbm_ = GetBestModel(self.df, self.output_feature, self.list_of_models)
 
     def check_create_experiment(self):
+        """
+        Check if the experiment exists; if not, create a new one.
+        :return: experiment_id of the experiment
+        """
         experiment_info = mlflow.get_experiment_by_name(self.experiment_name)
         if experiment_info is None:
             logger.info(f"No experiment found with name {self.experiment_name}, So creating one")
@@ -119,6 +123,11 @@ class ModelReTrainer:
         return experiment_id
 
     def check_create_parent_run(self, experiment_id):
+        """
+        Check if a parent run exists in the experiment; if not, create one with the configured parent run name.
+        :param experiment_id: experiment id
+        :return: the parent run id
+        """
         parent_runs_df = mlflow.search_runs(experiment_id)
         parent_runs_df = parent_runs_df[parent_runs_df['tags.mlflow.runName'] == self.parent_run_name]
         if not parent_runs_df.empty:
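The same get-or-create pattern for a named parent run, sketched standalone; `mlflow.search_runs` returns a pandas DataFrame in which the `tags.mlflow.runName` column holds each run's name:

```python
import mlflow

def get_or_create_parent_run(experiment_id: str, run_name: str) -> str:
    """Return the run id of the run named `run_name`, creating the run if it is absent."""
    runs = mlflow.search_runs([experiment_id])        # DataFrame of runs in the experiment
    if not runs.empty:
        named = runs[runs['tags.mlflow.runName'] == run_name]
        if not named.empty:
            return named.iloc[0]['run_id']
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
        return run.info.run_id
```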
@@ -131,20 +140,35 @@ class ModelReTrainer:
         return run.info.run_id
 
     def check_create_child_run(self, experiment_id, parent_run_id):
+        """
+        Check whether a child run exists under the parent run in this experiment.
+        If one exists, take the child run id that has a model saved and check when it was last trained,
+        then decide from that whether to retrain; if retraining is needed, create a new child run.
+        If no child run exists under the parent run, create a new one.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :return: child run id, retrain flag
+        """
         child_runs_df = mlflow.search_runs(experiment_id, filter_string=f"tags.mlflow.parentRunId='{parent_run_id}'")
         if not child_runs_df.empty:
             logger.info(f"Already Child runs are present for Parent Run Id {parent_run_id}")
-            # child_runs_df.to_csv('child.csv',index=False)
-            child_run_id, child_run_history, retrain = self.get_latest_child_run(experiment_id, parent_run_id,
-                                                                                 child_runs_df)
-            return child_run_id, child_run_history, retrain
+            child_run_id, retrain = self.get_latest_child_run(experiment_id, parent_run_id, child_runs_df)
+            return child_run_id, retrain
         else:
             logger.info(f"Child runs are not present for Parent Run Id {parent_run_id}")
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
+                    return child_run.info.run_id, True
 
     def get_latest_child_run(self, experiment_id, parent_run_id, runs_df):
+        """
+        If no child run has a model saved, create a new child run. Otherwise, validate the last run time
+        and either create a new child run (if retraining is needed) or return the latest child run id.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :param runs_df: the child runs of the parent run
+        :return: latest child run id, retrain flag
+        """
         history_key = 'tags.mlflow.log-model.history'
         if history_key in runs_df.columns:
             runs_df = runs_df[runs_df[history_key].notna()]
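mlflow tags every nested run with `mlflow.parentRunId`, which is exactly what the `filter_string` above queries; creating and finding child runs can be sketched as:

```python
import mlflow

def new_child_run(experiment_id: str, parent_run_id: str) -> str:
    """Open a nested run under the given parent and return the child's run id."""
    with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
        with mlflow.start_run(experiment_id=experiment_id, nested=True) as child:
            return child.info.run_id

def find_child_runs(experiment_id: str, parent_run_id: str):
    """DataFrame of all runs whose mlflow.parentRunId tag matches the parent."""
    return mlflow.search_runs(
        [experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{parent_run_id}'",
    )
```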
@@ -154,20 +178,24 @@ class ModelReTrainer:
             logger.info("Existing Child Runs doesn't contain any model to run. So creating new child run")
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
+                    return child_run.info.run_id, True
         latest_child_run_id = list(runs_df['run_id'])[0]
-        latest_child_history = list(runs_df['tags.mlflow.log-model.history'])[0]
         latest_run_info = runs_df.iloc[:1]
         retrain = False
         day_check_flag = self.check_existing_model_retrain(latest_child_run_id, latest_run_info, retrain)
         if day_check_flag:
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
-        return latest_child_run_id, latest_child_history, retrain
+                    return child_run.info.run_id, True
+        return latest_child_run_id, retrain
 
     @staticmethod
     def load_model_pyfunc(model_path):
+        """
+        Load the model from the mlflow artifact path.
+        :param model_path: model path on mlflow
+        :return: the loaded model
+        """
         try:
             model = mlflow.pyfunc.load_model(model_path)
             logger.info("loading the model")
@@ -176,7 +204,15 @@ class ModelReTrainer:
             logger.exception(str(e))
 
     def check_existing_model_retrain(self, latest_child_run_id, child_run_info, retrain):
-        # edit this to check the time difference between the last trained model and the configured time difference
+        """
+        If retrain is True, return True: retraining is required.
+        If retrain is False, check the time difference between the last child run and the current time
+        and return True or False depending on that difference.
+        :param latest_child_run_id: latest child run id
+        :param child_run_info: latest child run info
+        :param retrain: retrain flag
+        :return: final retrain flag
+        """
         if retrain:
             logger.info("Retraining Needed...")
             return True
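The age check itself falls outside this hunk; a minimal sketch of one plausible implementation, assuming the latest run's `end_time` from `search_runs` (a timezone-aware pandas Timestamp) and a configurable retraining interval in days:

```python
import pandas as pd

def needs_retrain(child_run_info: pd.DataFrame, max_age_days: int = 7) -> bool:
    """True when the latest child run finished more than `max_age_days` ago."""
    last_end = child_run_info.iloc[0]['end_time']   # tz-aware UTC timestamp
    age = pd.Timestamp.now(tz='UTC') - last_end
    return age > pd.Timedelta(days=max_age_days)
```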
@@ -195,6 +231,11 @@ class ModelReTrainer:
         return False
 
     def forming_loading_path(self, latest_run_id):
+        """
+        Form the mlflow model path from the child run id.
+        :param latest_run_id: latest child run id
+        :return: the path to the model
+        """
         try:
             model_name = self.model_save_name
             return f"runs:/{latest_run_id}/{model_name}"
@@ -202,6 +243,13 @@ class ModelReTrainer:
             logger.exception(f"Exception while forming loading path - {e}")
 
     def model_trainer(self, experiment_id, parent_run_id, child_run_id):
+        """
+        Train the model under the given experiment, parent run and child run.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :param child_run_id: child run id
+        :return: the final model
+        """
         with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
             with mlflow.start_run(experiment_id=experiment_id, run_id=child_run_id, nested=True):
                 model, model_name, metrics, hyperparams = self._gbm_.compare_get_best_model(self.fine_tune_tech,
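The logging calls inside the doubly nested context sit outside this hunk; a generic sketch of how the chosen model, metrics and hyperparameters would typically be recorded on the child run (the sklearn flavor and the variable values are assumptions):

```python
import mlflow
import mlflow.sklearn

with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
    with mlflow.start_run(experiment_id=experiment_id, run_id=child_run_id, nested=True):
        mlflow.log_params(hyperparams)                 # e.g. {'n_estimators': 200}
        mlflow.log_metrics(metrics)                    # e.g. {'rmse': 0.42}
        mlflow.sklearn.log_model(model, "versioning")  # artifact name = model_save_name
```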
@@ -213,9 +261,13 @@ class ModelReTrainer:
                 return model
 
     def get_latest_model(self):
+        """
+        Main entry point: return the latest model, retraining first if needed.
+        :return: the latest model
+        """
         experiment_id = self.check_create_experiment()
         parent_run_id = self.check_create_parent_run(experiment_id)
-        child_run_id, child_run_history, retrain = self.check_create_child_run(experiment_id, parent_run_id)
+        child_run_id, retrain = self.check_create_child_run(experiment_id, parent_run_id)
         logger.info(f"Retrain flag is {retrain}")
         if retrain:
             logger.info("Retraining needed")
@@ -10,6 +10,12 @@ class GetBestModel:
         self.list_of_models = list_of_models
 
     def compare_get_best_model(self, fine_tune_tech, comparison_metric):
+        """
+        Train the listed models and compare them using the fine-tune technique and comparison metric.
+        :param fine_tune_tech: search library used to fine-tune the selected model
+        :param comparison_metric: metric used to select the best model
+        :return: the best model, its name, metrics and hyperparameters
+        """
         try:
             logger.info("Using Pycaret to train mentioned models")
             regression.setup(data=self.df, target=self.target_col_list)
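The surrounding calls follow PyCaret 2.x's functional regression API; a self-contained sketch under that assumption (the model list, tuning library and target column are illustrative, and `search_library='optuna'` requires optuna to be installed):

```python
import pandas as pd
from pycaret import regression

df = pd.read_csv('mlflow-test.csv')                                    # illustrative data
regression.setup(data=df, target='instantaneous_export', silent=True)  # PyCaret 2.x setup
best = regression.compare_models(include=['lr', 'rf', 'lightgbm'])     # best by default metric
tuned = regression.tune_model(best, search_library='optuna')           # fine-tune technique
metrics = regression.pull()                                            # metrics of last command
```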