Commit 3d6e41b7 authored by dasharatha.vamshi

init

parent 0a552b16
.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# pycharm
.idea/
# logs
logs/
Dockerfile
FROM python:3.7-slim
COPY . /opt
WORKDIR /opt
RUN pip install -r requirements.txt
CMD ["python", "main.py"]
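The Dockerfile installs dependencies from a requirements.txt that is not part of this commit. Judging by the imports used across the scripts (yaml, pymongo, sklearn.ensemble, logstash_async), a plausible requirements.txt would be the following; the exact package set and the absence of version pins are assumptions:

# requirements.txt (sketch; versions unpinned)
PyYAML
scikit-learn
pymongo
python-logstash-async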
conf/configuration.yml
#---------------Service Configurations----------------#
SERVICE_CONFIG:
  LOG_LEVEL: info
  LOG_HANDLER_NAME: RandomForest
  LOGSTASH_HOST: 192.168.1.47
  LOGSTASH_PORT: 5000
#--------------System Configurations--------------------#
# for testing use temp/mnt/ilens/ai-jobs/
SYSTEM_CONFIG:
  shared_mount_base_ai_job: /mnt/ilens/ai-jobs/
  default_run_id: default_run_id
  min_samples_split: 10
  min_samples_leaf: 4
#----------------------If read conf from mongo------------#
FOR_EACH_MONGO_CONFIG:
  READ_FROM_MONGO: true
  MONGO_URI: mongodb://192.168.0.210:27017
  MONGO_DB: iLensAiPipeline
  MONGO_RUN_COLL: runMetadata
  MONGO_SITE_COLL: siteMetadata
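config_parser.py below reads run and component parameters from JSON files on the shared mount; the FOR_EACH_MONGO_CONFIG block suggests the same metadata could also come from Mongo. A minimal sketch of such a lookup with pymongo, using the values above; the pipeline_id and created_at field names inside the documents are assumptions:

from pymongo import MongoClient, DESCENDING

# Connection values taken from FOR_EACH_MONGO_CONFIG above.
client = MongoClient("mongodb://192.168.0.210:27017")
db = client["iLensAiPipeline"]

# Fetch the most recent run document for a pipeline; the
# "pipeline_id" and "created_at" field names are assumptions.
latest_run = db["runMetadata"].find_one(
    {"pipeline_id": "pipeline_313"},
    sort=[("created_at", DESCENDING)],
)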
main.py
import os
import pickle
import traceback

from sklearn.ensemble import RandomForestRegressor

from scripts.common.config_parser import config
from scripts.common.constants import ModelObjectConstants
from scripts.common.logsetup import logger


class RandomForest:
    def __init__(self, model_object, component_out_dir, query):
        self.model_object = model_object
        self.component_out_dir = component_out_dir
        self.query = query
        self.n_estimators = self.query['n_estimators']
        self.min_samples_split = self.query['min_samples_split']
        self.min_samples_leaf = self.query['min_samples_leaf']
        self.max_features = self.query['max_features']
        self.max_depth = self.query['max_depth']
        self.bootstrap = self.query['bootstrap']
        self.criterion = self.query['criterion']
        self.parameters = {
            "n_estimators": self.n_estimators,
            "min_samples_split": self.min_samples_split,
            "min_samples_leaf": self.min_samples_leaf,
            "max_features": self.max_features,
            "max_depth": self.max_depth,
            "bootstrap": self.bootstrap,
            "criterion": self.criterion
        }

    def random_forest(self):
        try:
            logger.info("Creating model object for the model " + self.model_object)
            logger.info("Parameters used: " + str(self.parameters))
            rf = RandomForestRegressor(n_estimators=self.n_estimators,
                                       min_samples_split=self.min_samples_split,
                                       min_samples_leaf=self.min_samples_leaf,
                                       max_features=self.max_features,
                                       max_depth=self.max_depth,
                                       bootstrap=self.bootstrap,
                                       criterion=self.criterion)
            filename = 'random_forest.pkl'
            logger.info("Pickling the model...")
            with open(os.path.join(self.component_out_dir, filename), 'wb') as f:
                pickle.dump(rf, f)
            return True
        except Exception as e:
            logger.error(traceback.format_exc())
            raise ValueError(e)


if __name__ == '__main__':
    try:
        obj = RandomForest(config['model_name'], config['component_output_dir'], config)
        if config['model_name'] == ModelObjectConstants.RANDOMFOREST:
            val = obj.random_forest()
            if val:
                if len(os.listdir(config['component_output_dir'])) > 0:
                    logger.info("File created successfully")
                else:
                    logger.info("The output directory is empty")
            else:
                logger.error("Random Forest component failed")
    except Exception:
        logger.error("Random Forest component failed")
        logger.error(traceback.format_exc())
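Note that random_forest() pickles an unfitted estimator: per ModelObjectConstants.NEXT_COMPONENT, the pickle lands in ModelFitting's input directory and training happens downstream. A minimal sketch of how that downstream component might unpickle and fit it; the directory values and the training arrays are placeholders:

import os
import pickle

import numpy as np

# Illustrative path built from the defaults used in config_parser.py.
component_input_dir = "/mnt/ilens/ai-jobs/pipeline_313/default_run_id/ModelFitting/input"

# Load the serialized, unfitted RandomForestRegressor written by this component.
with open(os.path.join(component_input_dir, "random_forest.pkl"), "rb") as f:
    rf = pickle.load(f)

# Placeholder training data; the real component would read its own inputs.
X_train = np.random.rand(100, 4)
y_train = np.random.rand(100)

rf.fit(X_train, y_train)
print(rf.predict(X_train[:5]))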
scripts/common/config_parser.py
#!/usr/bin/env python
import json
import os
import sys

import yaml

from scripts.common.constants import ModelObjectConstants


def str2bool(txt):
    """Interpret common truthy strings; everything else is False."""
    return str(txt).lower() in ('true', '1', 'yes')


config_path = os.path.join(os.getcwd(), "conf", "configuration.yml")
if os.path.exists(config_path):
    sys.stderr.write("Reading config from --> {}\n".format(config_path))
    with open(config_path, 'r') as stream:
        _config = yaml.safe_load(stream)
else:
    sys.stderr.write("Configuration not found. Exiting.\n")
    sys.exit(1)
# Uncomment for local testing:
# os.environ['pipeline_id'] = "pipe1"
# os.environ['model_name'] = 'RANDOMFOREST'
# os.environ['bootstrap'] = "True"
# os.environ['max_features'] = 'sqrt'
# os.environ['n_estimators'] = "800"
# os.environ['criterion'] = 'mse'
# os.environ['max_depth'] = '90'

# ------------------------ Configurations ------------------------
bootstrap = str2bool(os.environ.get('bootstrap', True))
max_features = os.environ.get('max_features', 'sqrt')
min_samples_split = int(os.environ.get("min_samples_split", _config.get("SYSTEM_CONFIG", {}).get('min_samples_split')))
min_samples_leaf = int(os.environ.get("min_samples_leaf", _config.get("SYSTEM_CONFIG", {}).get('min_samples_leaf')))
pipeline_id = os.environ.get('pipeline_id', "pipeline_313")
shared_mount_base_ai_job = os.environ.get("shared_mount_base_ai_job",
                                          _config.get("SYSTEM_CONFIG", {}).get('shared_mount_base_ai_job'))

# Run id is read from $shared_mount_base_ai_job/$pipeline_id/run_config.json
run_id_path = os.path.join(shared_mount_base_ai_job, pipeline_id, "run_config.json")
try:
    sys.stderr.write("Checking for run id parameters at path " + run_id_path + "\n")
    with open(run_id_path) as f:
        run_id_param = json.load(f)
    run_id = run_id_param['run_id']
except Exception:
    sys.stderr.write("run_config.json not found; falling back to the default run id\n")
    run_id = _config.get("SYSTEM_CONFIG", {}).get('default_run_id')

# Component parameters live at
# $shared_mount_base_ai_job/$pipeline_id/$run_id/RandomForest/param.json
component_parameter_path = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                        ModelObjectConstants.COMPONENT_NAME, "param.json")
try:
    sys.stderr.write("Checking for component parameters at path " + component_parameter_path + "\n")
    with open(component_parameter_path) as f:
        component_parameter = json.load(f)
    n_estimators = int(component_parameter['n_estimators'])
    criterion = component_parameter['criterion']
    max_depth = int(component_parameter['max_depth'])
except Exception:
    sys.stderr.write("param.json not found; falling back to environment variables\n")
    try:
        n_estimators = int(os.environ.get('n_estimators'))
        criterion = os.environ.get('criterion')
        max_depth = int(os.environ.get('max_depth'))
    except Exception as e:
        raise KeyError(e)

component_output_dir = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                    ModelObjectConstants.NEXT_COMPONENT, "input")
component_input_dir = os.path.join(shared_mount_base_ai_job, pipeline_id, run_id,
                                   ModelObjectConstants.COMPONENT_NAME, "input")

model_name = os.environ.get('model_name', "randomforest").lower()

BASE_LOG_PATH = os.path.join(os.getcwd(), "logs")
if not os.path.exists(BASE_LOG_PATH):
    os.mkdir(BASE_LOG_PATH)
LOG_LEVEL = os.environ.get("LOG_LEVEL", _config.get('SERVICE_CONFIG', {}).get("LOG_LEVEL", "INFO")).upper()
LOG_HANDLER_NAME = _config.get('SERVICE_CONFIG', {}).get("LOG_HANDLER_NAME", "RandomForest")
ENABLE_LOGSTASH_LOG = os.environ.get("ENABLE_LOGSTASH_LOG", 'False').lower()
LOGSTASH_HOST = _config.get('SERVICE_CONFIG', {}).get('LOGSTASH_HOST')
LOGSTASH_PORT = str(_config.get('SERVICE_CONFIG', {}).get('LOGSTASH_PORT'))

config = {
    'pipeline_id': pipeline_id,
    'run_id': run_id,
    'shared_mount_base_ai_job': shared_mount_base_ai_job,
    'component_output_dir': component_output_dir,
    'model_name': model_name,
    'bootstrap': bootstrap,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth
}

if not os.path.exists(config['shared_mount_base_ai_job']):
    sys.stderr.write("Shared path does not exist!\n")
    sys.stderr.write("Creating path --> {}\n".format(config['shared_mount_base_ai_job']))
    os.makedirs(config['shared_mount_base_ai_job'])

if not os.path.exists(config['component_output_dir']):
    sys.stderr.write("component_output_dir does not exist!\n")
    sys.stderr.write("Creating path --> {}\n".format(config['component_output_dir']))
    os.makedirs(config['component_output_dir'])
scripts/common/constants.py
#!/usr/bin/env python


class ModelObjectConstants:
    NEXT_COMPONENT = "ModelFitting"
    COMPONENT_NAME = "RandomForest"
    HTTP = "http://"
    RANDOMFOREST = "randomforest"
    LOG_VAR_MESSAGE = "\n" + "#" * 25 + "\n" + "{}" + "\n" + "#" * 25 + "\n" + "{}\n"


class ComponentExceptions:
    INVALID_AZURE_FILE_PATH_EXCEPTION = "AZURE PATH ERROR"
    INVALID_LOCAL_FILE_PATH_EXCEPTION = "No File in the local path"
    INVALID_ARTIFACT_BASE_PATH_EXCEPTION = "Artifact base path value is missing"
    INVALID_AZURE_FILE_NAME_EXCEPTION = "Artifact name is missing"
    INVALID_CONTAINER_NAME = "Container name is missing"
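LOG_VAR_MESSAGE is a two-slot format template that frames a title and a payload between '#' banners. A short usage sketch; the title and payload values here are illustrative:

from scripts.common.constants import ModelObjectConstants
from scripts.common.logsetup import logger

# Renders a '#' banner, the title, another banner, then the payload.
logger.info(ModelObjectConstants.LOG_VAR_MESSAGE.format(
    "RandomForest parameters", {"n_estimators": 900, "max_depth": 100}))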
scripts/common/logsetup.py
import os
import logging
from logging.handlers import RotatingFileHandler

from logstash_async.handler import AsynchronousLogstashHandler

from scripts.common.config_parser import (LOG_LEVEL, LOG_HANDLER_NAME, BASE_LOG_PATH,
                                          LOGSTASH_HOST, LOGSTASH_PORT, ENABLE_LOGSTASH_LOG)

DEFAULT_FORMAT = '%(asctime)s %(levelname)5s %(name)s %(message)s'
DEBUG_FORMAT = '%(asctime)s %(levelname)5s %(name)s [%(threadName)5s:%(filename)5s:%(funcName)5s():%(lineno)s] ' \
               '%(message)s'
EXTRA = {}

FORMATTER = DEFAULT_FORMAT
if LOG_LEVEL.strip() == "DEBUG":
    FORMATTER = DEBUG_FORMAT

# Register a custom TRACE level below DEBUG.
TRACE_LEVEL = logging.DEBUG - 5
logging.addLevelName(TRACE_LEVEL, 'TRACE')


class ILensLogger(logging.getLoggerClass()):
    def trace(self, msg, *args, **kwargs):
        if self.isEnabledFor(TRACE_LEVEL):
            self._log(TRACE_LEVEL, msg, args, **kwargs)


def get_logger(log_handler_name):
    """
    Purpose: create a logger.
    :param log_handler_name: name of the log handler.
    :return: logger object.
    """
    log_path = os.path.join(BASE_LOG_PATH, log_handler_name + ".log")
    logging.setLoggerClass(ILensLogger)
    _logger = logging.getLogger(log_handler_name)
    _logger.setLevel(LOG_LEVEL.strip().upper())

    formatter = logging.Formatter(FORMATTER)

    # Console handler.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LOG_LEVEL)
    stream_handler.setFormatter(formatter)
    _logger.addHandler(stream_handler)

    # Rotating file handler (10 MB per file, 5 backups).
    file_handler = RotatingFileHandler(log_path, maxBytes=10485760, backupCount=5)
    file_handler.setFormatter(formatter)
    _logger.addHandler(file_handler)

    # Optional Logstash handler.
    if ENABLE_LOGSTASH_LOG == 'true' and LOGSTASH_HOST is not None \
            and LOGSTASH_PORT is not None and LOGSTASH_PORT.isdigit():
        _logger.addHandler(AsynchronousLogstashHandler(LOGSTASH_HOST, int(LOGSTASH_PORT), database_path=None))
    return _logger


logger = get_logger(LOG_HANDLER_NAME)
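Since ILensLogger registers a TRACE level five below DEBUG, trace messages surface only when the configured LOG_LEVEL is at or below TRACE. A short usage sketch; the message and payload are illustrative:

from scripts.common.logsetup import logger

# Emitted only if the configured LOG_LEVEL is TRACE (DEBUG - 5) or lower.
logger.trace("entering random_forest() with parameters: %s", {"n_estimators": 900})

# Standard levels work as usual.
logger.info("model object created")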
param.json (sample component parameters)
{
    "n_estimators": 900,
    "criterion": "mse",
    "max_depth": 100
}