Commit 3d6e41b7 authored by dasharatha.vamshi

init

parent 0a552b16
.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# pycharm
.idea/
# logs
logs/
Dockerfile
FROM python:3.7-slim
COPY . /opt
WORKDIR /opt
RUN pip install -r requirements.txt
CMD ["python", "main.py"]
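The Dockerfile installs dependencies from a requirements.txt that is not part of this commit. Judging by the imports used across the scripts (yaml, pymongo, sklearn.ensemble, logstash_async), a plausible requirements.txt would be the following; the exact package set and the absence of version pins are assumptions:

# requirements.txt (sketch; versions unpinned)
PyYAML
scikit-learn
pymongo
python-logstash-async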
conf/configuration.yml
#---------------Service Configurations----------------#
SERVICE_CONFIG:
  LOG_LEVEL: info
  LOG_HANDLER_NAME: RandomForest
  LOGSTASH_HOST: 192.168.1.47
  LOGSTASH_PORT: 5000
#--------------System Configurations--------------------#
# for testing use temp/mnt/ilens/ai-jobs/
SYSTEM_CONFIG:
  shared_mount_base_ai_job: /mnt/ilens/ai-jobs/
  default_run_id: default_run_id
  min_samples_split: 10
  min_samples_leaf: 4
#----------------------If read conf from mongo------------#
FOR_EACH_MONGO_CONFIG:
  READ_FROM_MONGO: true
  MONGO_URI: mongodb://192.168.0.210:27017
  MONGO_DB: iLensAiPipeline
  MONGO_RUN_COLL: runMetadata
  MONGO_SITE_COLL: siteMetadata
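config_parser.py below reads run and component parameters from JSON files on the shared mount; the FOR_EACH_MONGO_CONFIG block suggests the same metadata could also come from Mongo. A minimal sketch of such a lookup with pymongo, using the values above; the pipeline_id and created_at field names inside the documents are assumptions:

from pymongo import MongoClient, DESCENDING

# Connection values taken from FOR_EACH_MONGO_CONFIG above.
client = MongoClient("mongodb://192.168.0.210:27017")
db = client["iLensAiPipeline"]

# Fetch the most recent run document for a pipeline; the
# "pipeline_id" and "created_at" field names are assumptions.
latest_run = db["runMetadata"].find_one(
    {"pipeline_id": "pipeline_313"},
    sort=[("created_at", DESCENDING)],
)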
main.py
import os
import pickle
import traceback

from sklearn.ensemble import RandomForestRegressor

from scripts.common.config_parser import config
from scripts.common.constants import ModelObjectConstants
from scripts.common.logsetup import logger


class RandomForest:
    def __init__(self, model_object, component_out_dir, query):
        self.model_object = model_object
        self.component_out_dir = component_out_dir
        self.query = query
        self.n_estimators = self.query['n_estimators']
        self.min_samples_split = self.query['min_samples_split']
        self.min_samples_leaf = self.query['min_samples_leaf']
        self.max_features = self.query['max_features']
        self.max_depth = self.query['max_depth']
        self.bootstrap = self.query['bootstrap']
        self.criterion = self.query['criterion']
        self.parameters = {
            "n_estimators": self.n_estimators,
            "min_samples_split": self.min_samples_split,
            "min_samples_leaf": self.min_samples_leaf,
            "max_features": self.max_features,
            "max_depth": self.max_depth,
            "bootstrap": self.bootstrap,
            "criterion": self.criterion
        }

    def random_forest(self):
        try:
            logger.info("Creating model object for the model " + self.model_object)
            logger.info("Parameters used: " + str(self.parameters))
            rf = RandomForestRegressor(n_estimators=self.n_estimators,
                                       min_samples_split=self.min_samples_split,
                                       min_samples_leaf=self.min_samples_leaf,
                                       max_features=self.max_features,
                                       max_depth=self.max_depth,
                                       bootstrap=self.bootstrap,
                                       criterion=self.criterion)
            filename = 'random_forest.pkl'
            logger.info("Pickling the model...")
            with open(os.path.join(self.component_out_dir, filename), 'wb') as f:
                pickle.dump(rf, f)
            return True
        except Exception as e:
            logger.error(traceback.format_exc())
            raise ValueError(e)


if __name__ == '__main__':
    try:
        obj = RandomForest(config['model_name'], config['component_output_dir'], config)
        if config['model_name'] == ModelObjectConstants.RANDOMFOREST:
            val = obj.random_forest()
            if val:
                if len(os.listdir(config['component_output_dir'])) > 0:
                    logger.info("File created successfully")
                else:
                    logger.info("The output directory is empty")
            else:
                logger.error("Random Forest component failed")
    except Exception:
        logger.error("Random Forest component failed")
        logger.error(traceback.format_exc())
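Note that random_forest() pickles an unfitted estimator: per ModelObjectConstants.NEXT_COMPONENT, the pickle lands in ModelFitting's input directory and training happens downstream. A minimal sketch of how that downstream component might unpickle and fit it; the directory values and the training arrays are placeholders:

import os
import pickle

import numpy as np

# Illustrative path built from the defaults used in config_parser.py.
component_input_dir = "/mnt/ilens/ai-jobs/pipeline_313/default_run_id/ModelFitting/input"

# Load the serialized, unfitted RandomForestRegressor written by this component.
with open(os.path.join(component_input_dir, "random_forest.pkl"), "rb") as f:
    rf = pickle.load(f)

# Placeholder training data; the real component would read its own inputs.
X_train = np.random.rand(100, 4)
y_train = np.random.rand(100)

rf.fit(X_train, y_train)
print(rf.predict(X_train[:5]))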
scripts/common/config_parser.py
#!/usr/bin/env python
import json
import os
import sys

import yaml

from scripts.common.constants import ModelObjectConstants


def str2bool(txt):
    """Interpret common truthy strings; everything else is False."""
    return str(txt).lower() in ('true', '1', 'yes')


config_path = os.path.join(os.getcwd(), "conf", "configuration.yml")
if os.path.exists(config_path):
    sys.stderr.write("Reading config from --> {}\n".format(config_path))
    with open(config_path, 'r') as stream:
        _config = yaml.safe_load(stream)
else:
    sys.stderr.write("Configuration not found. Exiting.\n")
    sys.exit(1)
# Uncomment for local testing:
# os.environ['pipeline_id'] = "pipe1"
# os.environ['model_name'] = 'RANDOMFOREST'
# os.environ['bootstrap'] = "True"
# os.environ['max_features'] = 'sqrt'
# os.environ['n_estimators'] = "800"
# os.environ['criterion'] = 'mse'
# os.environ['max_depth'] = '90'

# ------------------------ Configurations ------------------------
bootstrap = str2bool(os.environ.get('bootstrap', True))
max_features = os.environ.get('max_features', 'sqrt')
min_samples_split = int(os.environ.get("min_samples_split", _config.get("SYSTEM_CONFIG", {}).get('min_samples_split')))
min_samples_leaf = int(os.environ.get("min_samples_leaf", _config.get("SYSTEM_CONFIG", {}).get('min_samples_leaf')))
pipeline_id = os.environ.get('pipeline_id', "pipeline_313")
shared_mount_base_ai_job = os.environ.get("shared_mount_base_ai_job",
                                          _config.get("SYSTEM_CONFIG", {}).get('shared_mount_base_ai_job'))

# Run id is read from $shared_mount_base_ai_job/$pipeline_id/run_config.json
run_id_path = os.path.join(shared_mount_base_ai_job, pipeline_id, "run_config.json")
try:
    sys.stderr.write("Checking for run id parameters at path " + run_id_path + "\n")
    with open(run_id_path) as f:
        run_id_param = json.load(f)
    run_id = run_id_param['run_id']
except Exception:
    sys.stderr.write("run_config.json not found; falling back to the default run id\n")
    run_id = _config.get("SYSTEM_CONFIG", {}).get('default_run_id')

# Component parameters live at
# $shared_mount_base_ai_job/$pipeline_id/$run_id/RandomForest/param.json
component_parameter_path = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                        ModelObjectConstants.COMPONENT_NAME, "param.json")
try:
    sys.stderr.write("Checking for component parameters at path " + component_parameter_path + "\n")
    with open(component_parameter_path) as f:
        component_parameter = json.load(f)
    n_estimators = int(component_parameter['n_estimators'])
    criterion = component_parameter['criterion']
    max_depth = int(component_parameter['max_depth'])
except Exception:
    sys.stderr.write("param.json not found; falling back to environment variables\n")
    try:
        n_estimators = int(os.environ.get('n_estimators'))
        criterion = os.environ.get('criterion')
        max_depth = int(os.environ.get('max_depth'))
    except Exception as e:
        raise KeyError(e)

component_output_dir = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                    ModelObjectConstants.NEXT_COMPONENT, "input")
component_input_dir = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                   ModelObjectConstants.COMPONENT_NAME, "input")

model_name = os.environ.get('model_name', "randomforest").lower()

BASE_LOG_PATH = os.path.join(os.getcwd(), "logs")
if not os.path.exists(BASE_LOG_PATH):
    os.mkdir(BASE_LOG_PATH)
LOG_LEVEL = os.environ.get("LOG_LEVEL", _config.get('SERVICE_CONFIG', {}).get("LOG_LEVEL", "INFO")).upper()
LOG_HANDLER_NAME = _config.get('SERVICE_CONFIG', {}).get("LOG_HANDLER_NAME", "RandomForest")
ENABLE_LOGSTASH_LOG = os.environ.get("ENABLE_LOGSTASH_LOG", 'False').lower()
LOGSTASH_HOST = _config.get('SERVICE_CONFIG', {}).get('LOGSTASH_HOST')
LOGSTASH_PORT = str(_config.get('SERVICE_CONFIG', {}).get('LOGSTASH_PORT'))

config = {
    'pipeline_id': pipeline_id,
    'run_id': run_id,
    'shared_mount_base_ai_job': shared_mount_base_ai_job,
    'component_output_dir': component_output_dir,
    'model_name': model_name,
    'bootstrap': bootstrap,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth
}

if not os.path.exists(config['shared_mount_base_ai_job']):
    sys.stderr.write("Shared path does not exist!\n")
    sys.stderr.write("Creating path --> {}\n".format(config['shared_mount_base_ai_job']))
    os.makedirs(config['shared_mount_base_ai_job'])

if not os.path.exists(config['component_output_dir']):
    sys.stderr.write("component_output_dir does not exist!\n")
    sys.stderr.write("Creating path --> {}\n".format(config['component_output_dir']))
    os.makedirs(config['component_output_dir'])
scripts/common/constants.py
#!/usr/bin/env python


class ModelObjectConstants:
    NEXT_COMPONENT = "ModelFitting"
    COMPONENT_NAME = "RandomForest"
    HTTP = "http://"
    RANDOMFOREST = "randomforest"
    LOG_VAR_MESSAGE = "\n" + "#" * 25 + "\n" + "{}" + "\n" + "#" * 25 + "\n" + "{}\n"


class ComponentExceptions:
    INVALID_AZURE_FILE_PATH_EXCEPTION = "AZURE PATH ERROR"
    INVALID_LOCAL_FILE_PATH_EXCEPTION = "No File in the local path"
    INVALID_ARTIFACT_BASE_PATH_EXCEPTION = "Artifact base path value is missing"
    INVALID_AZURE_FILE_NAME_EXCEPTION = "Artifact name is missing"
    INVALID_CONTAINER_NAME = "Container name is missing"
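LOG_VAR_MESSAGE is a two-slot format template that frames a title and a payload between '#' banners. A short usage sketch; the title and payload values here are illustrative:

from scripts.common.constants import ModelObjectConstants
from scripts.common.logsetup import logger

# Renders a '#' banner, the title, another banner, then the payload.
logger.info(ModelObjectConstants.LOG_VAR_MESSAGE.format(
    "RandomForest parameters", {"n_estimators": 900, "max_depth": 100}))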
scripts/common/logsetup.py
import os
import logging
from logging.handlers import RotatingFileHandler

from logstash_async.handler import AsynchronousLogstashHandler

from scripts.common.config_parser import (LOG_LEVEL, LOG_HANDLER_NAME, BASE_LOG_PATH,
                                          LOGSTASH_HOST, LOGSTASH_PORT, ENABLE_LOGSTASH_LOG)

DEFAULT_FORMAT = '%(asctime)s %(levelname)5s %(name)s %(message)s'
DEBUG_FORMAT = '%(asctime)s %(levelname)5s %(name)s [%(threadName)5s:%(filename)5s:%(funcName)5s():%(lineno)s] ' \
               '%(message)s'
EXTRA = {}

FORMATTER = DEFAULT_FORMAT
if LOG_LEVEL.strip() == "DEBUG":
    FORMATTER = DEBUG_FORMAT

# Register a custom TRACE level below DEBUG.
TRACE_LEVEL = logging.DEBUG - 5
logging.addLevelName(TRACE_LEVEL, 'TRACE')


class ILensLogger(logging.getLoggerClass()):
    def trace(self, msg, *args, **kwargs):
        if self.isEnabledFor(TRACE_LEVEL):
            self._log(TRACE_LEVEL, msg, args, **kwargs)


def get_logger(log_handler_name):
    """
    Purpose: create a logger.
    :param log_handler_name: name of the log handler.
    :return: logger object.
    """
    log_path = os.path.join(BASE_LOG_PATH, log_handler_name + ".log")
    logging.setLoggerClass(ILensLogger)
    _logger = logging.getLogger(log_handler_name)
    _logger.setLevel(LOG_LEVEL.strip().upper())

    formatter = logging.Formatter(FORMATTER)

    # Console handler.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LOG_LEVEL)
    stream_handler.setFormatter(formatter)
    _logger.addHandler(stream_handler)

    # Rotating file handler (10 MB per file, 5 backups).
    file_handler = RotatingFileHandler(log_path, maxBytes=10485760, backupCount=5)
    file_handler.setFormatter(formatter)
    _logger.addHandler(file_handler)

    # Optional Logstash handler.
    if ENABLE_LOGSTASH_LOG == 'true' and LOGSTASH_HOST is not None \
            and LOGSTASH_PORT is not None and LOGSTASH_PORT.isdigit():
        _logger.addHandler(AsynchronousLogstashHandler(LOGSTASH_HOST, int(LOGSTASH_PORT), database_path=None))
    return _logger


logger = get_logger(LOG_HANDLER_NAME)
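Since ILensLogger registers a TRACE level five below DEBUG, trace messages surface only when the configured LOG_LEVEL is at or below TRACE. A short usage sketch; the message and payload are illustrative:

from scripts.common.logsetup import logger

# Emitted only if the configured LOG_LEVEL is TRACE (DEBUG - 5) or lower.
logger.trace("entering random_forest() with parameters: %s", {"n_estimators": 900})

# Standard levels work as usual.
logger.info("model object created")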
param.json (sample component parameters)
{
    "n_estimators": 900,
    "criterion": "mse",
    "max_depth": 100
}