Commit 925ca1fc authored by dasharatha.vamshi

updated

parent e5389819
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
reports/*.pdf
reports/*.csv
reports/*.xlsx
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
.idea
logs
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
### VisualStudio template
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
### JupyterNotebooks template
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.env
@@ -4,5 +4,9 @@ from mlflow_util import ModelReTrainer
 if __name__ == "__main__":
     df = pd.read_csv('mlflow-test.csv')
-    obj = ModelReTrainer(df, 'instantaneous_export', 'Dalmia Solar Forecasting V2', 'Forecasting_kadapa_v1', 'versioning')
+    feature = 'instantaneous_export'
+    exp_name = 'Dalmia Solar Forecasting V2'
+    parent_run_name = 'Forecasting_kadapa_v1'
+    model_name = 'versioning'
+    obj = ModelReTrainer(df, feature, exp_name, parent_run_name, model_name)
     obj.get_latest_model()
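The change above lifts the four positional string literals into named locals. A further, purely hypothetical step would be keyword arguments, which keep the call site self-documenting without the extra variables; the parameter names below are assumptions inferred from the attributes the class uses (`output_feature`, `experiment_name`, `parent_run_name`, `model_save_name`), not its confirmed signature:

```python
# Hypothetical sketch only: parameter names are guesses from the class attributes,
# so check ModelReTrainer.__init__ before using this form.
obj = ModelReTrainer(
    df,
    output_feature='instantaneous_export',
    experiment_name='Dalmia Solar Forecasting V2',
    parent_run_name='Forecasting_kadapa_v1',
    model_save_name='versioning',
)
obj.get_latest_model()
```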
This source diff could not be displayed because it is too large. You can view the blob instead.
@@ -107,6 +107,10 @@ class ModelReTrainer:
         self._gbm_ = GetBestModel(self.df, self.output_feature, self.list_of_models)
 
     def check_create_experiment(self):
+        """
+        Check if the experiment exists; if not, create a new one.
+        :return: experiment_id of the experiment
+        """
         experiment_info = mlflow.get_experiment_by_name(self.experiment_name)
         if experiment_info is None:
             logger.info(f"No experiment found with name {self.experiment_name}, So creating one")
@@ -119,6 +123,11 @@ class ModelReTrainer:
         return experiment_id
 
     def check_create_parent_run(self, experiment_id):
+        """
+        Check if a parent run exists in the experiment; if not, create one with the configured parent run name.
+        :param experiment_id: experiment id
+        :return: the parent run id
+        """
         parent_runs_df = mlflow.search_runs(experiment_id)
         parent_runs_df = parent_runs_df[parent_runs_df['tags.mlflow.runName'] == self.parent_run_name]
         if not parent_runs_df.empty:
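The same get-or-create pattern for a named parent run, sketched standalone; `mlflow.search_runs` returns a pandas DataFrame in which the `tags.mlflow.runName` column holds each run's name:

```python
import mlflow

def get_or_create_parent_run(experiment_id: str, run_name: str) -> str:
    """Return the run id of the run named `run_name`, creating the run if it is absent."""
    runs = mlflow.search_runs([experiment_id])        # DataFrame of runs in the experiment
    if not runs.empty:
        named = runs[runs['tags.mlflow.runName'] == run_name]
        if not named.empty:
            return named.iloc[0]['run_id']
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
        return run.info.run_id
```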
@@ -131,20 +140,35 @@ class ModelReTrainer:
         return run.info.run_id
 
     def check_create_child_run(self, experiment_id, parent_run_id):
+        """
+        Check whether a child run exists under the parent run in this experiment.
+        If one exists, take the child run id that has a model saved and check when it was last trained,
+        then decide from that whether to retrain; if retraining is needed, create a new child run.
+        If no child run exists under the parent run, create a new one.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :return: child run id, retrain flag
+        """
         child_runs_df = mlflow.search_runs(experiment_id, filter_string=f"tags.mlflow.parentRunId='{parent_run_id}'")
         if not child_runs_df.empty:
             logger.info(f"Already Child runs are present for Parent Run Id {parent_run_id}")
-            # child_runs_df.to_csv('child.csv',index=False)
-            child_run_id, child_run_history, retrain = self.get_latest_child_run(experiment_id, parent_run_id,
-                                                                                 child_runs_df)
-            return child_run_id, child_run_history, retrain
+            child_run_id, retrain = self.get_latest_child_run(experiment_id, parent_run_id, child_runs_df)
+            return child_run_id, retrain
         else:
             logger.info(f"Child runs are not present for Parent Run Id {parent_run_id}")
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
+                    return child_run.info.run_id, True
 
     def get_latest_child_run(self, experiment_id, parent_run_id, runs_df):
+        """
+        If no child run has a model saved, create a new child run. Otherwise, validate the last run time
+        and either create a new child run (if retraining is needed) or return the latest child run id.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :param runs_df: the child runs of the parent run
+        :return: latest child run id, retrain flag
+        """
         history_key = 'tags.mlflow.log-model.history'
         if history_key in runs_df.columns:
             runs_df = runs_df[runs_df[history_key].notna()]
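mlflow tags every nested run with `mlflow.parentRunId`, which is exactly what the `filter_string` above queries; creating and finding child runs can be sketched as:

```python
import mlflow

def new_child_run(experiment_id: str, parent_run_id: str) -> str:
    """Open a nested run under the given parent and return the child's run id."""
    with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
        with mlflow.start_run(experiment_id=experiment_id, nested=True) as child:
            return child.info.run_id

def find_child_runs(experiment_id: str, parent_run_id: str):
    """DataFrame of all runs whose mlflow.parentRunId tag matches the parent."""
    return mlflow.search_runs(
        [experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{parent_run_id}'",
    )
```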
@@ -154,20 +178,24 @@ class ModelReTrainer:
             logger.info("Existing Child Runs doesn't contain any model to run. So creating new child run")
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
+                    return child_run.info.run_id, True
         latest_child_run_id = list(runs_df['run_id'])[0]
-        latest_child_history = list(runs_df['tags.mlflow.log-model.history'])[0]
         latest_run_info = runs_df.iloc[:1]
         retrain = False
         day_check_flag = self.check_existing_model_retrain(latest_child_run_id, latest_run_info, retrain)
         if day_check_flag:
             with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
                 with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
-                    return child_run.info.run_id, None, True
-        return latest_child_run_id, latest_child_history, retrain
+                    return child_run.info.run_id, True
+        return latest_child_run_id, retrain
 
     @staticmethod
     def load_model_pyfunc(model_path):
+        """
+        Load the model from the mlflow artifact path.
+        :param model_path: model path on mlflow
+        :return: the loaded model
+        """
         try:
             model = mlflow.pyfunc.load_model(model_path)
             logger.info("loading the model")
@@ -176,7 +204,15 @@ class ModelReTrainer:
             logger.exception(str(e))
 
     def check_existing_model_retrain(self, latest_child_run_id, child_run_info, retrain):
-        # edit this to check the time difference between the last trained model and the configured time difference
+        """
+        If retrain is True, return True: retraining is required.
+        If retrain is False, check the time difference between the last child run and the current time
+        and return True or False depending on that difference.
+        :param latest_child_run_id: latest child run id
+        :param child_run_info: latest child run info
+        :param retrain: retrain flag
+        :return: final retrain flag
+        """
         if retrain:
             logger.info("Retraining Needed...")
             return True
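The age check itself falls outside this hunk; a minimal sketch of one plausible implementation, assuming the latest run's `end_time` from `search_runs` (a timezone-aware pandas Timestamp) and a configurable retraining interval in days:

```python
import pandas as pd

def needs_retrain(child_run_info: pd.DataFrame, max_age_days: int = 7) -> bool:
    """True when the latest child run finished more than `max_age_days` ago."""
    last_end = child_run_info.iloc[0]['end_time']   # tz-aware UTC timestamp
    age = pd.Timestamp.now(tz='UTC') - last_end
    return age > pd.Timedelta(days=max_age_days)
```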
@@ -195,6 +231,11 @@ class ModelReTrainer:
         return False
 
     def forming_loading_path(self, latest_run_id):
+        """
+        Form the mlflow model path from the child run id.
+        :param latest_run_id: latest child run id
+        :return: the path to the model
+        """
         try:
             model_name = self.model_save_name
             return f"runs:/{latest_run_id}/{model_name}"
@@ -202,6 +243,13 @@ class ModelReTrainer:
             logger.exception(f"Exception while forming loading path - {e}")
 
     def model_trainer(self, experiment_id, parent_run_id, child_run_id):
+        """
+        Train the model under the given experiment, parent run and child run.
+        :param experiment_id: experiment id
+        :param parent_run_id: parent run id
+        :param child_run_id: child run id
+        :return: the final model
+        """
         with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
             with mlflow.start_run(experiment_id=experiment_id, run_id=child_run_id, nested=True):
                 model, model_name, metrics, hyperparams = self._gbm_.compare_get_best_model(self.fine_tune_tech,
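The logging calls inside the doubly nested context sit outside this hunk; a generic sketch of how the chosen model, metrics and hyperparameters would typically be recorded on the child run (the sklearn flavor and the variable values are assumptions):

```python
import mlflow
import mlflow.sklearn

with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
    with mlflow.start_run(experiment_id=experiment_id, run_id=child_run_id, nested=True):
        mlflow.log_params(hyperparams)                 # e.g. {'n_estimators': 200}
        mlflow.log_metrics(metrics)                    # e.g. {'rmse': 0.42}
        mlflow.sklearn.log_model(model, "versioning")  # artifact name = model_save_name
```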
@@ -213,9 +261,13 @@ class ModelReTrainer:
                 return model
 
     def get_latest_model(self):
+        """
+        Main entry point: return the latest model, retraining first if needed.
+        :return: the latest model
+        """
         experiment_id = self.check_create_experiment()
         parent_run_id = self.check_create_parent_run(experiment_id)
-        child_run_id, child_run_history, retrain = self.check_create_child_run(experiment_id, parent_run_id)
+        child_run_id, retrain = self.check_create_child_run(experiment_id, parent_run_id)
         logger.info(f"Retrain flag is {retrain}")
         if retrain:
             logger.info("Retraining needed")
@@ -10,6 +10,12 @@ class GetBestModel:
         self.list_of_models = list_of_models
 
     def compare_get_best_model(self, fine_tune_tech, comparison_metric):
+        """
+        Train the listed models and compare them using the fine-tune technique and comparison metric.
+        :param fine_tune_tech: search library used to fine-tune the selected model
+        :param comparison_metric: metric used to select the best model
+        :return: the best model, its name, metrics and hyperparameters
+        """
         try:
             logger.info("Using Pycaret to train mentioned models")
             regression.setup(data=self.df, target=self.target_col_list)
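The surrounding calls follow PyCaret 2.x's functional regression API; a self-contained sketch under that assumption (the model list, tuning library and target column are illustrative, and `search_library='optuna'` requires optuna to be installed):

```python
import pandas as pd
from pycaret import regression

df = pd.read_csv('mlflow-test.csv')                                    # illustrative data
regression.setup(data=df, target='instantaneous_export', silent=True)  # PyCaret 2.x setup
best = regression.compare_models(include=['lr', 'rf', 'lightgbm'])     # best by default metric
tuned = regression.tune_model(best, search_library='optuna')           # fine-tune technique
metrics = regression.pull()                                            # metrics of last command
```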