Commit a752455a authored by dasharatha.vamshi

added code for Fy676A

parent 402087d1
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
reports/*.pdf
reports/*.csv
reports/*.xlsx
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
.idea
logs
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
### VisualStudio template
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
### JupyterNotebooks template
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.env
import warnings
from loguru import logger
import pandas as pd
from scripts.constants.constants import RawConstants
from scripts.section_utils.bof_section import preprocess_bof_section
from scripts.section_utils.extruder_section import preprocess_extruder_section
from scripts.section_utils.material_section import preprocess_viscosity_section
from scripts.section_utils.mixer_section import preprocess_mixer_section
from scripts.section_utils.pickup_section import preprocess_pickup_section
from scripts.section_utils.sheet_supply_section import preprocess_sheet_section
warnings.filterwarnings("ignore")
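# Pipeline overview (as implemented below): the raw process Excel export and the
# viscosity Excel export are read, each plant section (material/viscosity, sheet
# supply, mixer, extruder, BOF, pickup) is preprocessed and aggregated per
# 'batch-date', and every section writes its aggregate to a <section>-agg.csv
# file in the working directory.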
def read_raw_data(raw_path, raw_skip_rows):
df = pd.read_excel(raw_path, skiprows=raw_skip_rows)
if len(df.columns) == len(RawConstants.columns):
logger.info(f"Total cols are {len(RawConstants.columns)} and are same as the df cols length")
df.columns = RawConstants.columns
else:
missed_cols = RawConstants.columns[len(df.columns):]
logger.info(f"missed cols are {missed_cols}")
for col in missed_cols:
df[col] = float('nan')
df.columns = RawConstants.columns
logger.info(f"Shape of df is {df.shape}")
return df
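# Illustrative example (hypothetical numbers): if the exported sheet carries only
# the first 150 of the RawConstants.columns headers, the remaining columns are
# appended as NaN so the section utilities can still select them by name.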
def start_prediction(raw_path, viscosity_path, index_no, raw_skip_rows, viscosity_skip_rows):
logger.info("Reading raw file data")
df = read_raw_data(raw_path, raw_skip_rows)
logger.info(f"Shape of raw df is {df.shape}")
logger.info("Starting preprocessing material section")
viscosity_df, raw_viscosity_df = preprocess_viscosity_section(viscosity_path, index_no, viscosity_skip_rows)
viscosity_df.to_csv('viscosity-agg.csv')
logger.info(f"The shape of the viscosity df is {viscosity_df.shape}")
logger.info("Completed material section preprocessing")
logger.info("Starting preprocessing sheet section")
df_sheet_grouped = preprocess_sheet_section(df, index_no)
logger.info(f"The shape of the Sheet df is {df_sheet_grouped.shape}")
logger.info("Completed sheet section preprocessing")
df_sheet_grouped.to_csv('sheet-agg.csv')
logger.info("Starting preprocessing mixer section")
df_mixer_grouped = preprocess_mixer_section(df, index_no)
logger.info(f"The shape of the Mixer df is {df_mixer_grouped.shape}")
logger.info("Completed mixer section preprocessing")
df_mixer_grouped.to_csv('mixer-agg.csv')
logger.info("Starting preprocessing extruder section")
df_extruder_grouped = preprocess_extruder_section(df, index_no, raw_viscosity_df)
logger.info(f"The shape of the Extruder df is {df_extruder_grouped.shape}")
logger.info("Completed extruder section preprocessing")
df_extruder_grouped.to_csv('extruder-agg.csv')
logger.info("Starting preprocessing bof section")
df_bof_grouped = preprocess_bof_section(df, index_no, raw_viscosity_df)
logger.info(f"The shape of the BOF df is {df_bof_grouped.shape}")
logger.info("Completed bof section preprocessing")
df_bof_grouped.to_csv('bof-agg.csv')
bof_desc = df_bof_grouped.describe()
bof_desc.to_csv('bof-describe.csv')
logger.info("Starting preprocessing pickup section")
df_pickup_grouped = preprocess_pickup_section(df, index_no, raw_viscosity_df)
logger.info(f"The shape of the Extruder df is {df_pickup_grouped.shape}")
logger.info("Completed pickup section preprocessing")
df_pickup_grouped.to_csv('pickup-agg.csv')
df = pd.read_csv('pickup-agg.csv')
print(df.describe())
if __name__ == "__main__":
try:
logger.info("Starting the model")
index_number = 1250
raw_file_path = 'FY676-A-WO_Visc.xlsx'
raw_file_skip_rows = 0
viscosity_file_path = 'viscosity_natural_rubber_data.xlsx'
viscosity_file_skip_rows = 3
start_prediction(raw_file_path, viscosity_file_path, index_number, raw_file_skip_rows, viscosity_file_skip_rows)
except Exception as e:
logger.exception(f"Module failed because of error {e}")
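# Note: the input paths above are relative, so the script is assumed to run from a
# directory containing FY676-A-WO_Visc.xlsx and viscosity_natural_rubber_data.xlsx;
# the *-agg.csv outputs are written to the same working directory.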
loguru==0.5.3
numpy==1.22.3
openpyxl==3.1.2
pandas==1.5.3
mlflow==1.20.2
protobuf==3.20.1
scikit-learn==1.2.2
loguru==0.5.3
mlflow==1.20.2
numpy==1.22.3
openpyxl==3.1.2
pandas==1.5.3
pytz==2023.3.post1
requests==2.31.0
scikit-base==0.5.2
scikit-learn==1.2.2
scikit-plot==0.3.7
scipy==1.10.1
six==1.16.0
statsmodels==0.14.1
xlrd==2.0.1
class RawConstants:
columns = ['Time Stamp',
'Shipper size No.',
'Shipper No.1 DH',
'Shipper No.1 Pallet',
'Shipper No.3 DH',
'Shipper No.2 Pallet',
'Shipper No.3 DH.1',
'Shipper No.3 Pallet',
'Size No (INDEX No)',
'Weighing times',
'Process mass',
'Mass',
'Material detection',
'Surface temperature (mixer side)',
'Surface temperature (center)',
'Surface temperature (receiving side)',
'temperature',
'humidity',
'Weighing command No.',
'spare',
'spare.1',
'spare.2',
'spare.3',
'spare.4',
'Size No (INDEX No).1',
'Weighing times.1',
'Process mass.1',
'real mass',
'spare.5',
'spare.6',
'spare.7',
'Size No (INDEX No).2',
'Weighing times.2',
'Process mass.2',
'CB weighing machine measurement',
'Dust collection duct (Immediately after ****)',
'Dust collection duct (before dust collector)\n',
'CB slot open',
'CB slot closed',
'carbon cycle',
'carbon2 cycle',
'spare.8',
'spare.9',
'spare.10',
'spare.11',
'Size No (INDEX No).3',
'Size name',
'Mixing batch number',
'Mixing Weight (Integrated Value)',
'Rotor actual rpm',
'Mixing timer value',
'Temperature (DS side)',
'Temperature (WS side)',
'Electric power',
'Electric energy',
'Mixing electric power average',
'Ram pressure',
'Ram rising',
'Ram down',
'Ram position',
'front door open',
'Front door closed',
'lower door open',
'lower door closed',
'Before mixer rotation detection',
'After mixer rotation detection',
'Drilled side left Inlet side Cooling water temperature',
'Drilled side left Exit side Cooling water temperature',
'Drilled side right Inlet side Cooling water temperature',
'Drilled side right Exit side Cooling water temperature',
'Mixer rotor left inlet side Coolant temperature',
'Mixer rotor left output side Cooling water temperature',
'Mixer rotor right inlet side Coolant temperature',
'Mixer rotor right exit side Cooling water temperature',
'Mixer body temperature',
'Drilled side left Inlet side Cooling water flow rate',
'Drilled side left Exit side Cooling water flow rate',
'Drilled side right Inlet side Cooling water flow rate',
'Drilled side right Exit side Cooling water flow rate',
'Mixer rotor left inlet side Cooling water flow rate',
'Mixer rotor left outlet side Cooling water flow rate',
'Mixer rotor right inlet side Cooling water flow rate',
'Mixer rotor right outlet side Cooling water flow rate',
'temperature.1',
'humidity.1',
'idle time between batches',
'spare.12',
'spare.13',
'spare.14',
'spare.15',
'spare.16',
'spare.17',
'spare.18',
'Size No (INDEX No).4',
'discharge length',
'Hopper bank upper limit',
'middle of hopper bank',
'Hopper bank lower limit',
'Hopper bank below lower limit',
'Extruder rpm',
'Extruder current',
'Calendar rpm',
' Calendar current',
'Calendar bank load',
'Calendar GAP Operation side',
'Calendar GAP Opposite operation side',
'Residence time',
'Screw operation side Inlet side Cooling water temperature',
'Screw operation side Outlet side Cooling water temperature',
'Screw Opposite operation side Inlet side Cooling water temperature',
'Screw Opposite operation side Outlet side Cooling water temperature',
'Calender roll Lower side Inlet side Cooling water temperature',
'Calender roll Lower side Outlet side Cooling water temperature',
'Calender roll upper side Inlet side Cooling water temperature',
'Calender roll Upper side Outlet side Cooling water temperature',
'Screw operation side Inlet side Cooling water flow rate',
'Screw operation side Outlet side Cooling water flow rate',
'Screw Opposite operation side Inlet side Cooling water flow rate',
'Screw Opposite operation side Outlet side Cooling water flow rate',
'Calender roll Lower side Inlet side Cooling water flow rate',
'Calender roll Lower side Outlet side Cooling water flow rate',
'Calender roll upper side Inlet side Cooling water flow rate',
'Calender roll Upper side Outlet side Cooling water flow rate',
'Extruder body temperature',
'spare.19',
'spare.20',
'spare.21',
'spare.22',
'spare.23',
'spare.24',
'spare.25',
'Size No (INDEX No).5',
'length passed through',
'Material detection.1',
'Sheet temperature immediately after calendering',
'Withdrawal CV speed',
'DUST CV\nspeed',
'spare.26',
'spare.27',
'spare.28',
'Size No (INDEX No).6',
'length passed through.1',
'Material detection.2',
'Seat temperature immediately after BOF',
'temperature.2',
'humidity.2',
'spare.29',
'spare.30',
'spare.31',
'spare.32',
'Size No (INDEX No).7',
'Setting length',
'length passed through(Integrated Value)',
'Mass\n(Integrated Value)',
'Pallet No.',
'Loading completion flag',
'spare.33',
'spare.34',
'spare.35',
'spare.36',
'mixer cooling water',
'Under cooling water']
class ViscosityConstants:
rubber_cols = [
'Quantity using type1 bale',
'PO_type1',
'DIRT_type1',
'ASH_type1',
'VM_type1',
'PRI_type1',
'NITROGEN_type1',
'Temperature during transportation_type1[℃]',
'Humidity during transportation_type1[%]',
'Quantity using type2 bale',
'PO_type2',
'DIRT_type1.1',
'ASH_type2',
'VM_type2',
'PRI_type2',
'NITROGEN_type2',
'Temperature during transportation_type2[℃]',
'Humidity during transportation__type2[%]'
]
req_cols = [
'Rubber No.', 'Batch No.', 'Index No',
'Input rubber weight(0.1kg)', 'date', 'batch-date',
'Weight_type1', 'Weight_type2', 'Weighted_PO_type',
'Weighted_DIRT_type', 'Weighted_ASH_type', 'Weighted_VM_type',
'Weighted_PRI_type', 'Weighted_NITROGEN_type',
'Weighted_Temperature during transportation_type[℃]',
'Weighted_Humidity during transportation__type[%]', 'Weighted Sum', 'viscosity']
class SheetConstants:
sheet_supply_column = ['Time Stamp',
'Shipper size No.',
'Shipper No.1 DH',
'Shipper No.1 Pallet',
'Shipper No.3 DH',
'Shipper No.2 Pallet',
'Shipper No.3 DH.1',
'Shipper No.3 Pallet',
'Size No (INDEX No)',
'Weighing times',
'Process mass',
'Mass',
'Material detection',
'Surface temperature (mixer side)',
'Surface temperature (center)',
'Surface temperature (receiving side)',
'temperature',
'humidity',
'Weighing command No.',
'spare',
'spare.1',
'spare.2',
'spare.3',
'spare.4']
aggregation_dict = {
"Surface temperature (mixer side)": "mean",
"Surface temperature (center)": "std",
"Surface temperature (receiving side)": "mean",
"temperature": "mean",
"humidity": "mean",
'Process mass': 'mean',
}
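# The aggregation dictionaries defined in these constants are applied by the
# section utilities through a pandas groupby on the batch key, roughly:
#   grouped = section_df.groupby(['batch-date']).agg(aggregation_dict).reset_index()
# i.e. each key is a raw column name and each value the pandas aggregation
# applied per batch.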
class MixerConstants:
mixer_cols = ['Time Stamp',
'Size No (INDEX No).3',
'Size name',
'Mixing batch number',
'Mixing Weight (Integrated Value)',
'Rotor actual rpm',
'Mixing timer value',
'Temperature (DS side)',
'Temperature (WS side)',
'Electric power',
'Electric energy',
'Mixing electric power average',
'Ram pressure',
'Ram rising',
'Ram down',
'Ram position',
'front door open',
'Front door closed',
'lower door open',
'lower door closed',
'Before mixer rotation detection',
'After mixer rotation detection',
'Drilled side left Inlet side Cooling water temperature',
'Drilled side left Exit side Cooling water temperature',
'Drilled side right Inlet side Cooling water temperature',
'Drilled side right Exit side Cooling water temperature',
'Mixer rotor left inlet side Coolant temperature',
'Mixer rotor left output side Cooling water temperature',
'Mixer rotor right inlet side Coolant temperature',
'Mixer rotor right exit side Cooling water temperature',
'Mixer body temperature',
'Drilled side left Inlet side Cooling water flow rate',
'Drilled side left Exit side Cooling water flow rate',
'Drilled side right Inlet side Cooling water flow rate',
'Drilled side right Exit side Cooling water flow rate',
'Mixer rotor left inlet side Cooling water flow rate',
'Mixer rotor left outlet side Cooling water flow rate',
'Mixer rotor right inlet side Cooling water flow rate',
'Mixer rotor right outlet side Cooling water flow rate',
'temperature.1',
'humidity.1',
'idle time between batches',
]
aggregation_dict = {
'Mixing timer value': 'max',
'Temperature (DS side)': 'mean',
'Temperature (WS side)': 'std',
'Electric power': 'mean',
'Electric energy': 'mean',
'Mixing electric power average': 'mean',
'Ram pressure': 'mean',
# 'Ram rising': '',
# 'Ram down': '',
'Ram position': 'std',
# 'front door open': '',
# 'Front door closed': '',
# 'lower door open': '',
# 'lower door closed': '',
# 'Before mixer rotation detection': '',
# 'After mixer rotation detection': '',
'Drilled side left Inlet side Cooling water temperature': 'std',
'Drilled side left Exit side Cooling water temperature': 'mean', #
'Drilled side right Inlet side Cooling water temperature': 'mean',
'Drilled side right Exit side Cooling water temperature': 'std',
'Mixer rotor left inlet side Coolant temperature': 'std',
'Mixer rotor left output side Cooling water temperature': 'mean',
'Mixer rotor right inlet side Coolant temperature': 'mean',
'Mixer rotor right exit side Cooling water temperature': 'std',
'Mixer body temperature': 'mean',
'Drilled side left Inlet side Cooling water flow rate': 'std',
'Drilled side left Exit side Cooling water flow rate': 'mean', #
'Drilled side right Inlet side Cooling water flow rate': 'mean',
'Drilled side right Exit side Cooling water flow rate': 'std', #
'Mixer rotor left inlet side Cooling water flow rate': 'std',
'Mixer rotor left outlet side Cooling water flow rate': 'mean',
'Mixer rotor right inlet side Cooling water flow rate': 'mean',
'Mixer rotor right outlet side Cooling water flow rate': 'std',
'temperature.1': 'mean',
'humidity.1': 'mean',
'idle time between batches': 'mean',
'Mixing Weight (Integrated Value)_diff': 'max', # any agg will work
'max_rpm_count': 'max' # any agg will work
}
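# After aggregation the section utilities rename columns to snake_case with the
# aggregation suffix, e.g. 'Temperature (DS side)' aggregated with 'mean' becomes
# 'temperature_ds_side_mean', which is the naming used by the model feature list.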
class ExtruderConstants:
extruder_cols = ['Size No (INDEX No).4',
'discharge length',
'Hopper bank upper limit',
'middle of hopper bank',
'Hopper bank lower limit',
'Hopper bank below lower limit',
'Extruder rpm',
'Extruder current',
'Calendar rpm',
' Calendar current',
'Calendar bank load',
'Calendar GAP Operation side',
'Calendar GAP Opposite operation side',
'Residence time',
'Screw operation side Inlet side Cooling water temperature',
'Screw operation side Outlet side Cooling water temperature',
'Screw Opposite operation side Inlet side Cooling water temperature',
'Screw Opposite operation side Outlet side Cooling water temperature',
'Calender roll Lower side Inlet side Cooling water temperature',
'Calender roll Lower side Outlet side Cooling water temperature',
'Calender roll upper side Inlet side Cooling water temperature',
'Calender roll Upper side Outlet side Cooling water temperature',
'Screw operation side Inlet side Cooling water flow rate',
'Screw operation side Outlet side Cooling water flow rate',
'Screw Opposite operation side Inlet side Cooling water flow rate',
'Screw Opposite operation side Outlet side Cooling water flow rate',
'Calender roll Lower side Inlet side Cooling water flow rate',
'Calender roll Lower side Outlet side Cooling water flow rate',
'Calender roll upper side Inlet side Cooling water flow rate',
'Calender roll Upper side Outlet side Cooling water flow rate',
'Extruder body temperature',
'spare.19',
'spare.20',
'spare.21',
'spare.22',
'spare.23',
'spare.24',
'spare.25']
aggregate_dict = {
'discharge length': "max",
'Extruder rpm': "mean",
'Extruder current': "std",
'Calendar rpm': "std",
' Calendar current': "mean",
'Calendar bank load': "max",
'Calendar GAP Operation side': "median",
'Calendar GAP Opposite operation side': "std",
'Residence time': "max",
'Screw operation side Inlet side Cooling water temperature': "mean",
'Screw operation side Outlet side Cooling water temperature': "std",
'Screw Opposite operation side Inlet side Cooling water temperature': "mean",
'Screw Opposite operation side Outlet side Cooling water temperature': "std",
'Calender roll Lower side Inlet side Cooling water temperature': "mean",
'Calender roll Lower side Outlet side Cooling water temperature': "std",
'Calender roll upper side Inlet side Cooling water temperature': "mean",
'Calender roll Upper side Outlet side Cooling water temperature': "std",
'Screw operation side Inlet side Cooling water flow rate': "mean",
'Screw operation side Outlet side Cooling water flow rate': "std",
'Screw Opposite operation side Inlet side Cooling water flow rate': "mean",
'Screw Opposite operation side Outlet side Cooling water flow rate': "std",
'Calender roll Lower side Inlet side Cooling water flow rate': "mean",
'Calender roll Lower side Outlet side Cooling water flow rate': "std",
'Calender roll upper side Inlet side Cooling water flow rate': "mean",
'Calender roll Upper side Outlet side Cooling water flow rate': "std",
'Extruder body temperature': "mean"
}
class PickupConstants:
pick_cols = ['Size No (INDEX No).6',
'length passed through.1',
'Material detection.2',
'Seat temperature immediately after BOF',
'temperature.2',
'humidity.2',
'spare.29',
'spare.30',
'spare.31',
'spare.32']
pick_imp_mixer_cols = ['Time Stamp',
'Size No (INDEX No).3',
'Size name',
'Mixing batch number',
'idle time between batches',
]
pick_imp_bof_cols = ['Time Stamp',
'Size No (INDEX No).5',
'bof_batch_number'
]
pick_additional_cols = ['day',
'Time Stamp',
'length passed through',
'discharge length']
pick_aggregate_dict = {'Seat temperature immediately after BOF': 'mean', 'viscosity': 'mean'}
pick_grouped_cols = ['batch-date']
class BofConstants:
bof_cols = ['Size No (INDEX No).5',
'length passed through',
'Material detection.1',
'Sheet temperature immediately after calendering',
'Withdrawal CV speed',
'DUST CV\nspeed', 'spare.26',
'spare.27',
'spare.28', 'lower door open']
bof_add_cols = ['Time Stamp', 'day', 'lower door open']
bof_mixer_cols = ['Time Stamp',
'Size No (INDEX No).3',
'Size name',
'Mixing batch number',
'idle time between batches']
bof_aggregate_dict = {'Sheet temperature immediately after calendering': 'mean',
'Withdrawal CV speed': 'mean',
'DUST CV\nspeed': 'std'}
import warnings
from loguru import logger
import mlflow
warnings.filterwarnings("ignore")
class ModelLoader(object):
def __init__(self, model_info):
self.model_info = model_info
def load_model(self):
logger.info("Loading the Model")
if self.model_info["type"] == "mlflow.sklearn":
return self._load_mlflow_sklearn_model()
else:
logger.info("Unsupported Model Type")
def _load_mlflow_sklearn_model(self):
try:
_model = mlflow.sklearn.load_model(self.model_info["path"])
logger.debug("Model loaded successfully!")
return _model
except Exception as e:
logger.error("Error while loading mlflow.sklearn model : {}".format(str(e)))
import warnings
import mlflow
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from scripts.core.model_loader import ModelLoader
from scripts.section_utils.mlflow_util import ModelLoaderSaver
warnings.filterwarnings("ignore")
def model_trainer():
sheet_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\sheet-agg.csv')
mixer_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\mixer-agg.csv')
extruder_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\extruder-agg.csv')
bof_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\bof-agg.csv')
pickup_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\pickup-agg.csv')
viscosity_df = pd.read_csv(r'D:\kalypso\bsj-model-inference\viscosity-agg.csv')
# viscosity_df = viscosity_df[['batch-date', 'viscosity']]
merged_df = pd.merge(sheet_df, mixer_df, on='batch-date', how='left')
merged_df = pd.merge(merged_df, extruder_df, on='batch-date', how='left')
merged_df = pd.merge(merged_df, bof_df, on='batch-date', how='left')
merged_df = pd.merge(merged_df, pickup_df, on='batch-date', how='left')
df_grouped = pd.merge(merged_df, viscosity_df, on='batch-date', how='left')
print(df_grouped.columns)
selected_cols = df_grouped.columns
df_grouped = df_grouped[selected_cols]
# Extract batch number and date
batch_number = df_grouped['batch-date'].str.extract(r'Batch_(\d+\.\d+)_')[0].astype(float)
date = pd.to_datetime(df_grouped['batch-date'].str.extract(r'_(\d{4}-\d{2}-\d{2})$')[0])
# Add extracted data as separate columns
df_grouped['Batch Number'] = batch_number
df_grouped['Date'] = date
# Sort by 'Batch Number' and 'Date'
df_grouped = df_grouped.sort_values(by=['Date', 'Batch Number'])
df_grouped = pd.read_csv(r"D:\kalypso\bsj-model-inference\test-agg-data.csv")
df_grouped = round(df_grouped, 6)
df_grouped.to_csv('grouped.csv')
cols_x = [
    'temperature_ws_side_std', 'Weighted_VM_type', 'electric_energy_mean',
    'calender_roll_upper_side_inlet_side_cooling_water_temperature_mean',
    '_calendar_current_mean', 'Weighted_NITROGEN_type', 'ram_pressure_mean',
    'seat_temperature_immediately_after_bof_mean', 'surface_temperature_center_std',
    'screw_operation_side_outlet_side_cooling_water_flow_rate_std', 'Weighted_DIRT_type',
    'drilled_side_left_exit_side_cooling_water_temperature_mean',
    'sheet_temperature_immediately_after_calendering_mean',
    'calender_roll_lower_side_inlet_side_cooling_water_temperature_mean', 'temperature_mean',
    'calender_roll_lower_side_inlet_side_cooling_water_flow_rate_mean',
    'screw_opposite_operation_side_outlet_side_cooling_water_temperature_std',
    'temperature_ds_side_mean', 'Weighted_PRI_type', 'residence_time_max'
]
cols_y = "viscosity"
req_cols = cols_x + ['viscosity']
# df_grouped = round(df_grouped, 2)
features = df_grouped[cols_x]
# print(features.info())
# print(features.describe().to_csv('feature.csv'))
# print(df_grouped[req_cols].isnull().sum())
# df_grouped = round(df_grouped,2)
# df_grouped = pd.read_csv(r'D:\kalypso\bsj-model-inference\test-gr.csv')
labels = df_grouped[cols_y]
# print(df_grouped[cols_y].describe())
df_grouped[req_cols].to_csv('final.csv')
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=42, test_size=0.25)
print(f'x_train shape - {x_train.shape}')
print(f'x_test shape - {x_test.shape}')
print(f'y_train shape - {y_train.shape}')
print(f'y_test shape - {y_test.shape}')
params = {'bootstrap': False,
'ccp_alpha': 0.0,
'criterion': 'squared_error',
'max_depth': None,
'max_features': 1.0,
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'oob_score': False,
'random_state': 123,
'verbose': 0,
'warm_start': False}
model = ExtraTreesRegressor(**params)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = [round(value, 2) for value in y_pred]
metric_dictionary = dict()
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
mape = metrics.mean_absolute_percentage_error(y_test, predictions)
explained_variance_score = metrics.explained_variance_score(y_test, predictions)
max_error = metrics.max_error(y_test, predictions)
r2_score = metrics.r2_score(y_test, predictions)
median_absolute_error = metrics.median_absolute_error(y_test, predictions)
mean_poisson_deviance = metrics.mean_poisson_deviance(y_test, predictions)
mean_gamma_deviance = metrics.mean_gamma_deviance(y_test, predictions)
metric_dictionary["Mean Absolute Error (MAE)"] = mae
metric_dictionary["Mean Squared Error (MSE)"] = mse
metric_dictionary["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
metric_dictionary["Mean Absolute Percentage Error (MAPE)"] = mape
metric_dictionary["Explained Variance Score"] = explained_variance_score
metric_dictionary["Max Error"] = max_error
metric_dictionary["Median Absolute Error"] = median_absolute_error
metric_dictionary["R2 Score"] = r2_score
metric_dictionary["Mean Gamma Deviance"] = mean_gamma_deviance
metric_dictionary["Mean Poisson Deviance"] = mean_poisson_deviance
print(metric_dictionary)
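# The same regression-metric dictionary is rebuilt below for the model reloaded
# from MLflow and for the locally saved model; a small helper such as this
# (hypothetical) sketch would compute it once per prediction vector:
#   def regression_metrics(y_true, y_pred):
#       mse = metrics.mean_squared_error(y_true, y_pred)
#       return {
#           "Mean Absolute Error (MAE)": metrics.mean_absolute_error(y_true, y_pred),
#           "Mean Squared Error (MSE)": mse,
#           "Root Mean Squared Error (RMSE)": np.sqrt(mse),
#           "R2 Score": metrics.r2_score(y_true, y_pred),
#       }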
experiment_name = "BSJ-Models"
parent_run_name = model_save_name = model_type = "fy676a"
list_of_models = ['rf', 'xgboost', 'lr']
obj = ModelLoaderSaver(None, metric_dictionary, params, experiment_name, parent_run_name, model_save_name, model_type)
new_model = obj.get_latest_model()
y_pred = new_model.predict(x_test)
predictions = [round(value, 2) for value in y_pred]
metric_dictionary = dict()
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
mape = metrics.mean_absolute_percentage_error(y_test, predictions)
explained_variance_score = metrics.explained_variance_score(y_test, predictions)
max_error = metrics.max_error(y_test, predictions)
r2_score = metrics.r2_score(y_test, predictions)
median_absolute_error = metrics.median_absolute_error(y_test, predictions)
mean_poisson_deviance = metrics.mean_poisson_deviance(y_test, predictions)
mean_gamma_deviance = metrics.mean_gamma_deviance(y_test, predictions)
metric_dictionary["Mean Absolute Error (MAE)"] = mae
metric_dictionary["Mean Squared Error (MSE)"] = mse
metric_dictionary["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
metric_dictionary["Mean Absolute Percentage Error (MAPE)"] = mape
metric_dictionary["Explained Variance Score"] = explained_variance_score
metric_dictionary["Max Error"] = max_error
metric_dictionary["Median Absolute Error"] = median_absolute_error
metric_dictionary["R2 Score"] = r2_score
metric_dictionary["Mean Gamma Deviance"] = mean_gamma_deviance
metric_dictionary["Mean Poisson Deviance"] = mean_poisson_deviance
print(metric_dictionary)
# mlflow.sklearn.save_model(new_model, "models/fy676a")
saved_model = ModelLoader({
"type": "mlflow.sklearn",
"path": "models/fy676a"
}).load_model()
y_pred = saved_model.predict(x_test)
predictions = [round(value, 2) for value in y_pred]
metric_dictionary = dict()
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
mape = metrics.mean_absolute_percentage_error(y_test, predictions)
explained_variance_score = metrics.explained_variance_score(y_test, predictions)
max_error = metrics.max_error(y_test, predictions)
r2_score = metrics.r2_score(y_test, predictions)
median_absolute_error = metrics.median_absolute_error(y_test, predictions)
mean_poisson_deviance = metrics.mean_poisson_deviance(y_test, predictions)
mean_gamma_deviance = metrics.mean_gamma_deviance(y_test, predictions)
metric_dictionary["Mean Absolute Error (MAE)"] = mae
metric_dictionary["Mean Squared Error (MSE)"] = mse
metric_dictionary["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
metric_dictionary["Mean Absolute Percentage Error (MAPE)"] = mape
metric_dictionary["Explained Variance Score"] = explained_variance_score
metric_dictionary["Max Error"] = max_error
metric_dictionary["Median Absolute Error"] = median_absolute_error
metric_dictionary["R2 Score"] = r2_score
metric_dictionary["Mean Gamma Deviance"] = mean_gamma_deviance
metric_dictionary["Mean Poisson Deviance"] = mean_poisson_deviance
print(metric_dictionary)
model_trainer()
flavors:
  python_function:
    env: conda.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    python_version: 3.10.13
  sklearn:
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.2.2
utc_time_created: '2023-12-18 12:12:42.185881'
channels:
  - conda-forge
dependencies:
  - python=3.10.13
  - pip
  - pip:
      - mlflow
      - cloudpickle==3.0.0
      - scikit-learn==1.2.2
name: mlflow-env
mlflow
cloudpickle==3.0.0
scikit-learn==1.2.2
import math
import warnings
import traceback
from datetime import datetime
import numpy as np
import pandas as pd
from loguru import logger
from scripts.constants.constants import BofConstants
warnings.filterwarnings("ignore")
def mixer_section_start_end_time(raw_df, index_no):
try:
mixer_cols = BofConstants.bof_mixer_cols
mixer_df = raw_df[mixer_cols]
mixer_df['Time Stamp'] = pd.to_datetime(mixer_df['Time Stamp'])
mixer_df = mixer_df.sort_values(by='Time Stamp')
numeric_cols = mixer_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
mixer_df[numeric_cols] = mixer_df[numeric_cols].astype(float)
mixer_df['day'] = mixer_df['Time Stamp'].dt.date
mixer_df = mixer_df[mixer_df["Size No (INDEX No).3"] == index_no]
mixer_df = mixer_df[mixer_df["Mixing batch number"] != 0]
mixer_df['time_min'] = mixer_df['Time Stamp']
mixer_df['time_max'] = mixer_df['Time Stamp']
aggregation_dict = {
'time_min': 'min',
'time_max': 'max',
}
group_by = ['day', 'Mixing batch number']
df_mixer_grouped = mixer_df.groupby(group_by).agg(aggregation_dict).reset_index()
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped['time_max'] - df_mixer_grouped['time_min']
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped[
'mixer_section_time_diff_second'].dt.total_seconds()
df_mixer_grouped['batch-date'] = 'Batch_' + df_mixer_grouped['Mixing batch number'].astype(str) + '_' + \
df_mixer_grouped['day'].astype(str)
date_dict = {}
batch_lis = list(df_mixer_grouped['batch-date'].unique())
for each_bt in batch_lis:
df_nw = df_mixer_grouped[df_mixer_grouped['batch-date'] == each_bt]
date_dict[each_bt] = {"start_time": str(list(df_nw['time_min'])[0]),
'end_time': str(list(df_nw['time_max'])[0])}
return date_dict
except Exception as err:
logger.error(f'Error in fetching mixer batch date dictionary: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
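# date_dict maps each mixer 'batch-date' key to the first and last timestamp seen
# for that batch, e.g. (values illustrative only):
#   {'Batch_3.0_2023-11-01': {'start_time': '2023-11-01 08:05:12',
#                             'end_time': '2023-11-01 08:11:47'}}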
def return_batch_no_df(raw_df, viscosity_df, date_dict, index_number):
try:
logger.info('Getting bof batch number')
raw_df['day'] = raw_df['Time Stamp'].dt.date
raw_df['day'] = raw_df['day'].astype('str')
raw_df['Mixing batch number'] = raw_df['Mixing batch number'].astype('float')
raw_df['batch-date'] = 'Batch_' + raw_df['Mixing batch number'].astype(
'str') + '_' + raw_df['day'].astype('str')
bof_add_cols = BofConstants.bof_add_cols
bof_df = raw_df[BofConstants.bof_cols + bof_add_cols]
sorted_bof_df = bof_df.sort_values(by="Time Stamp", ascending=True)
sorted_bof_df = sorted_bof_df[sorted_bof_df['Size No (INDEX No).5'] == index_number]
dt_list = list(sorted_bof_df['day'].unique())
day_length_dic = {}
for each_day in dt_list:
day_df = sorted_bof_df[sorted_bof_df['day'] == each_day]
if (day_df['length passed through'].max() - day_df['length passed through'].min()) <= 0:
value = 0
else:
value = day_df['length passed through'].max() - day_df['length passed through'].min()
day_length_dic[each_day] = value
sorted_viscosity_df = viscosity_df.sort_values(by="Mixing date", ascending=True)
sorted_viscosity_df['day'] = sorted_viscosity_df['Mixing date'].dt.date
sorted_viscosity_df['day'] = sorted_viscosity_df['day'].astype('str')
extrud_visc_df = sorted_viscosity_df[['Batch No.', 'Input rubber weight(0.1kg)', 'day', 'Mixing date']]
extrud_visc_df['length_from_extruder'] = extrud_visc_df['day'].map(day_length_dic)
extrud_visc_df['length_from_extruder'] = extrud_visc_df['length_from_extruder'].fillna(0)
daily_sum_weight = extrud_visc_df.groupby('day')['Input rubber weight(0.1kg)'].sum() / 10
# Add a new column 'm/kg' by dividing 'length_from_extruder' by the sum for each day
extrud_visc_df['m/kg'] = extrud_visc_df.apply(
lambda row: row['length_from_extruder'] / daily_sum_weight[row['day']], axis=1)
extrud_visc_df['batch_length'] = extrud_visc_df.apply(
lambda row: row['m/kg'] * row['Input rubber weight(0.1kg)'] / 10, axis=1).astype('float64')
extrud_visc_df['batch_length'] = extrud_visc_df['batch_length'].apply(math.ceil)
extrud_visc_df['cumulative_length'] = extrud_visc_df.groupby('day')['batch_length'].cumsum()
discharge_dict = extrud_visc_df.groupby('day').apply(
lambda group: group.set_index('Batch No.').to_dict()['cumulative_length']).to_dict()
test_sorted_extr_df = sorted_bof_df
test_df = test_sorted_extr_df
# Initialize an empty list to store batch numbers
batch_numbers = []
# Iterate through each row in the DataFrame
for index, row in test_df.iterrows():
day = row['day']
discharge_length = row['length passed through']
if discharge_length == 0:
batch_numbers.append(0)
else:
# Check if the day is in the dictionary
if day in discharge_dict:
# Check if discharge length is less than or equal to the corresponding batch length
batch_length_dict = discharge_dict[day]
for batch_no, batch_length in batch_length_dict.items():
if discharge_length <= batch_length:
batch_numbers.append(batch_no)
break
else:
# If no match is found in the dictionary, carry forward the previous batch number
batch_numbers.append(batch_numbers[-1])
else:
# If day is not in the dictionary, assign NaN to batch number
batch_numbers.append(np.nan)
# Add the 'batch_no' column to the DataFrame
test_df['batch_no'] = batch_numbers
batch_number = 0
batch_list = []
started_with_one = False
current_day = None
for value, day in zip(list(test_df['lower door open']), list(test_df['day'])):
if current_day != day:
current_day = day
batch_number = 0
if value == 1:
if not started_with_one:
batch_number += 1
started_with_one = True
batch_list.append(batch_number)
else:
batch_list.append(batch_number)
started_with_one = False
batch_number = 0
batch_list = []
started_with_one = False
for value in test_df['lower door open']:
if value == 1:
if not started_with_one:
batch_number += 1
started_with_one = True
batch_list.append(batch_number)
else:
batch_list.append(batch_number)
started_with_one = False
test_df['batch_no'] = test_df['batch_no'].astype('float')
test_df['bof_batch_date'] = 'Batch_' + test_df['batch_no'].astype('str') + '_' + test_df['day'].astype(
'str')
extruder_flag_list = []
extrud_flg_vms = []
for i, value in test_df.iterrows():
if value['batch_no'] == 0.0:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
else:
start_time = date_dict.get(value["bof_batch_date"]).get("start_time")
end_time = date_dict.get(value["bof_batch_date"]).get("end_time")
if (datetime.strptime(str(value["Time Stamp"]).split('+')[0],
'%Y-%m-%d %H:%M:%S') > datetime.strptime(
start_time.split('+')[0], '%Y-%m-%d %H:%M:%S')) & \
(datetime.strptime(str(value["Time Stamp"]).split('+')[0],
'%Y-%m-%d %H:%M:%S') < datetime.strptime(
end_time.split('+')[0], '%Y-%m-%d %H:%M:%S')):
extruder_flag_list.append('true')
extrud_flg_vms.append(1)
else:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
test_df['bof_flag'] = extruder_flag_list
test_df['bof_batch_diff'] = extrud_flg_vms
# test_df['updated_bt_list'] = batch_list
test_df['bof_batch_number'] = test_df['batch_no'] - test_df['bof_batch_diff'].astype('float')
test_df['batch-date'] = 'Batch_' + test_df['bof_batch_number'].astype(
'str') + '_' + test_df['day'].astype('str')
return test_df
except Exception as er:
logger.error(f'Error in adding batch data to bof section: {str(er)}')
logger.error(traceback.format_exc())
raise Exception(str(er))
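# In return_batch_no_df the day's total 'length passed through' is split across the
# viscosity batches in proportion to their rubber weight, the resulting cumulative
# batch lengths act as breakpoints for binning each BOF row into a batch number,
# and rows whose timestamp still falls inside that batch's mixer window are shifted
# back one batch via 'bof_batch_diff' before the final 'batch-date' key is built.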
def preprocess_bof_section(df, index_number, vis_df):
try:
df['Time Stamp'] = pd.to_datetime(df['Time Stamp'])
df = df.sort_values(by='Time Stamp')
df['day'] = df['Time Stamp'].dt.date
df['day'] = df['day'].astype('str')
date_dict = mixer_section_start_end_time(df, index_number)
bof_merged_df_final = return_batch_no_df(df, vis_df, date_dict, index_number)
bof_merged_df_final = bof_merged_df_final[bof_merged_df_final['bof_batch_number'] != 0]
print(bof_merged_df_final.columns)
grouped_cols = ['batch-date']
aggregate_dict = BofConstants.bof_aggregate_dict
df_bof_grouped = bof_merged_df_final.groupby(grouped_cols).agg(aggregate_dict).reset_index()
col_renamer = {}
for col, col_agg in aggregate_dict.items():
if col not in ['viscosity', 'time_min', 'time_max', 'Mixing Weight (Integrated Value)_diff', 'max_rpm_count']:
renamed_col = f'{col.replace("(", "").replace(")", "").replace(" ", "_")}_{col_agg}'.lower()
col_renamer[col] = renamed_col
else:
col_renamer[col] = col
df_bof_grouped = df_bof_grouped.rename(columns=col_renamer)
df_bof_grouped_rest = df_bof_grouped.drop('batch-date', axis=1)
df_bof_grouped_rest = df_bof_grouped_rest.fillna(df_bof_grouped_rest.mean())
df_bof_grouped_rest = round(df_bof_grouped_rest, 6)
df_bof_grouped_rest['batch-date'] = df_bof_grouped['batch-date']
return df_bof_grouped_rest
except Exception as err:
logger.error(f'Error in fetching the bof preprocess data: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
import math
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
from loguru import logger
from scripts.constants.constants import ExtruderConstants
warnings.filterwarnings("ignore")
def mixer_section_start_end_time(raw_df, index_no):
mixer_cols = ['Time Stamp',
'Size No (INDEX No).3',
'Size name',
'Mixing batch number',
'idle time between batches',
]
mixer_df = raw_df[mixer_cols]
mixer_df['Time Stamp'] = pd.to_datetime(mixer_df['Time Stamp'])
mixer_df = mixer_df.sort_values(by='Time Stamp')
numeric_cols = mixer_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
mixer_df[numeric_cols] = mixer_df[numeric_cols].astype(float)
mixer_df['day'] = mixer_df['Time Stamp'].dt.date
mixer_df = mixer_df[mixer_df["Size No (INDEX No).3"] == index_no]
mixer_df = mixer_df[mixer_df["Mixing batch number"] != 0]
mixer_df['time_min'] = mixer_df['Time Stamp']
mixer_df['time_max'] = mixer_df['Time Stamp']
aggregation_dict = {
'time_min': 'min',
'time_max': 'max',
}
group_by = ['day', 'Mixing batch number']
df_mixer_grouped = mixer_df.groupby(group_by).agg(aggregation_dict).reset_index()
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped['time_max'] - df_mixer_grouped['time_min']
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped[
'mixer_section_time_diff_second'].dt.total_seconds()
df_mixer_grouped['batch-date'] = 'Batch_' + df_mixer_grouped['Mixing batch number'].astype(str) + '_' + \
df_mixer_grouped['day'].astype(str)
date_dict = {}
batch_lis = list(df_mixer_grouped['batch-date'].unique())
for each_bt in batch_lis:
df_nw = df_mixer_grouped[df_mixer_grouped['batch-date'] == each_bt]
date_dict[each_bt] = {"start_time": str(list(df_nw['time_min'])[0]),
'end_time': str(list(df_nw['time_max'])[0])}
return date_dict
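# This helper mirrors mixer_section_start_end_time in the BOF section: it returns
# the per-'batch-date' start/end timestamps of the mixer run, which are later used
# to decide whether an extruder row still belongs to the previous batch.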
def return_batch_no_df(
raw_df, viscosity_df, date_dict, bof_cols, additional_cols, index_no
):
raw_df = raw_df.sort_values(by='Time Stamp')
raw_df['Time Stamp'] = pd.to_datetime(raw_df['Time Stamp'])
raw_df["day"] = raw_df["Time Stamp"].dt.date
raw_df["day"] = raw_df["day"].astype("str")
raw_df["Mixing batch number"] = raw_df["Mixing batch number"].astype("float")
raw_df["batch-date"] = (
"Batch_"
+ raw_df["Mixing batch number"].astype("str")
+ "_"
+ raw_df["day"].astype("str")
)
bof_add_cols = bof_cols + additional_cols
bof_df = raw_df[bof_add_cols]
sorted_bof_df = bof_df.sort_values(by="Time Stamp", ascending=True)
sorted_bof_df = sorted_bof_df[sorted_bof_df["Size No (INDEX No).4"] == index_no]
dt_list = list(sorted_bof_df["day"].unique())
day_length_dic = {}
for each_day in dt_list:
day_df = sorted_bof_df[sorted_bof_df["day"] == each_day]
if day_df["discharge length"].max() - day_df["discharge length"].min() <= 0:
value = 0
else:
value = day_df["discharge length"].max() - day_df["discharge length"].min()
day_length_dic[each_day] = value
print(day_length_dic)
sorted_viscosity_df = viscosity_df.sort_values(by="Mixing date", ascending=True)
sorted_viscosity_df["day"] = sorted_viscosity_df["Mixing date"].dt.date
sorted_viscosity_df["day"] = sorted_viscosity_df["day"].astype("str")
extrud_visc_df = sorted_viscosity_df[
["Batch No.", "Input rubber weight(0.1kg)", "day", "Mixing date"]
]
extrud_visc_df["length_from_extruder"] = extrud_visc_df["day"].map(day_length_dic)
extrud_visc_df["length_from_extruder"] = extrud_visc_df[
"length_from_extruder"
].fillna(0)
daily_sum_weight = (
extrud_visc_df.groupby("day")["Input rubber weight(0.1kg)"].sum() / 10
)
# Add a new column 'm/kg' by dividing 'length_from_extruder' by the sum for each day
extrud_visc_df["m/kg"] = extrud_visc_df.apply(
lambda row: row["length_from_extruder"] / daily_sum_weight[row["day"]], axis=1
)
extrud_visc_df["batch_length"] = extrud_visc_df.apply(
lambda row: row["m/kg"] * row["Input rubber weight(0.1kg)"] / 10, axis=1
).astype("float64")
extrud_visc_df["batch_length"] = extrud_visc_df["batch_length"].apply(math.ceil)
extrud_visc_df["cumulative_length"] = extrud_visc_df.groupby("day")[
"batch_length"
].cumsum()
discharge_dict = (
extrud_visc_df.groupby("day")
.apply(
lambda group: group.set_index("Batch No.").to_dict()["cumulative_length"]
)
.to_dict()
)
test_sorted_extr_df = sorted_bof_df
test_df = test_sorted_extr_df
# Initialize an empty list to store batch numbers
batch_numbers = []
# Iterate through each row in the DataFrame
for index, row in test_df.iterrows():
day = row["day"]
discharge_length = row["discharge length"]
if discharge_length == 0:
batch_numbers.append(0)
else:
# Check if the day is in the dictionary
if day in discharge_dict:
# Check if discharge length is less than or equal to the corresponding batch length
batch_length_dict = discharge_dict[day]
for batch_no, batch_length in batch_length_dict.items():
if discharge_length <= batch_length:
batch_numbers.append(batch_no)
break
else:
# If no match is found in the dictionary, carry forward the previous batch number
batch_numbers.append(batch_numbers[-1])
else:
# If day is not in the dictionary, assign NaN to batch number
batch_numbers.append(np.nan)
# Add the 'batch_no' column to the DataFrame
test_df["batch_no"] = batch_numbers
batch_number = 0
batch_list = []
started_with_one = False
current_day = None
for value, day in zip(list(test_df["lower door open"]), list(test_df["day"])):
if current_day != day:
current_day = day
batch_number = 0
if value == 1:
if not started_with_one:
batch_number += 1
started_with_one = True
batch_list.append(batch_number)
else:
batch_list.append(batch_number)
started_with_one = False
test_df["batch_no"] = test_df["batch_no"].astype("float")
test_df["extruder_batch_date"] = (
"Batch_"
+ test_df["batch_no"].astype("str")
+ "_"
+ test_df["day"].astype("str")
)
extruder_flag_list = []
extrud_flg_vms = []
for i, value in test_df.iterrows():
if value["batch_no"] == 0.0:
extruder_flag_list.append("false")
extrud_flg_vms.append(0)
else:
start_time = date_dict.get(value["extruder_batch_date"]).get("start_time")
end_time = date_dict.get(value["extruder_batch_date"]).get("end_time")
if (datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') > datetime.strptime(
start_time.split('+')[0], '%Y-%m-%d %H:%M:%S')) & \
(datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') < datetime.strptime(
end_time.split('+')[0], '%Y-%m-%d %H:%M:%S')):
extruder_flag_list.append("true")
extrud_flg_vms.append(1)
else:
extruder_flag_list.append("false")
extrud_flg_vms.append(0)
test_df["extruder_flag"] = extruder_flag_list
test_df["extruder_batch_diff"] = extrud_flg_vms
test_df["updtaed_bt_list"] = batch_list
test_df["extruder_batch_number"] = test_df["batch_no"] - test_df[
"extruder_batch_diff"
].astype("float")
test_df["batch-date"] = (
"Batch_"
+ test_df["extruder_batch_number"].astype("str")
+ "_"
+ test_df["day"].astype("str")
)
return test_df
def preprocess_extruder_section(df, index_number, vis_df):
extruder_cols = ExtruderConstants.extruder_cols
additional_columns = ['Time Stamp']
df_extruder = df[extruder_cols + additional_columns]
df_extruder['Time Stamp'] = pd.to_datetime(df_extruder['Time Stamp'])
df_extruder = df_extruder.sort_values(by='Time Stamp')
df_extruder['day'] = df_extruder['Time Stamp'].dt.date
df_extruder['day'] = df_extruder['day'].astype('str')
sorted_extrud_df = df_extruder.sort_values(by="Time Stamp", ascending=True)
sorted_extrud_df = sorted_extrud_df[sorted_extrud_df['Size No (INDEX No).4'] == index_number]
drop_col = ['spare.19',
'spare.20',
'spare.21',
'spare.22',
'spare.23',
'spare.24',
'spare.25', 'Hopper bank upper limit',
'middle of hopper bank',
'Hopper bank lower limit',
'Hopper bank below lower limit']
sorted_extrud_df.drop(columns=drop_col, inplace=True)
date_dict = mixer_section_start_end_time(df, index_number)
additional_cols = ['day', 'Time Stamp', 'lower door open']
# adding date col to the viscosity df
vis_df = vis_df.sort_values(by='Mixing date')
vis_df['date'] = vis_df['Mixing date'].dt.date
vis_df['batch-date'] = 'Batch_' + vis_df['Batch No.'].astype('float').astype(str) + '_' + vis_df[
'date'].astype(str)
vis_df = vis_df[vis_df['Index No'] == index_number]
extruder_merged_df_final = return_batch_no_df(df, vis_df, date_dict, extruder_cols, additional_cols,
index_number)
extruder_merged_df_final = extruder_merged_df_final[extruder_merged_df_final['extruder_batch_number'] != 0]
grouped_cols = ['batch-date']
aggregate_dict = ExtruderConstants.aggregate_dict
df_extruder_grouped = extruder_merged_df_final.groupby(grouped_cols).agg(aggregate_dict).reset_index()
col_renamer = {}
for col, col_agg in aggregate_dict.items():
if col not in ['viscosity', 'time_min', 'time_max', 'Mixing Weight (Integrated Value)_diff', 'max_rpm_count']:
renamed_col = f'{col.replace("(", "").replace(")", "").replace(" ", "_")}_{col_agg}'.lower()
col_renamer[col] = renamed_col
else:
col_renamer[col] = col
df_extruder_grouped = df_extruder_grouped.rename(columns=col_renamer)
df_extruder_grouped = df_extruder_grouped.fillna(df_extruder_grouped.mean())
df_extruder_grouped = round(df_extruder_grouped, 6)
return df_extruder_grouped
import warnings
import pandas as pd
from scripts.constants.constants import ViscosityConstants
warnings.filterwarnings("ignore")
def preprocess_viscosity_section(path, index_number, viscosity_skip_rows):
viscosity_df = pd.read_excel(path, skiprows=viscosity_skip_rows)
# adding date col to the viscosity df
viscosity_df = viscosity_df.sort_values(by='Mixing date')
raw_viscosity_df = viscosity_df.sort_values(by='Mixing date')
viscosity_df['date'] = viscosity_df['Mixing date'].dt.date
viscosity_df['batch-date'] = 'Batch_' + viscosity_df['Batch No.'].astype(str) + '_' + viscosity_df['date'].astype(
str)
viscosity_df = viscosity_df[viscosity_df['Index No'] == index_number]
rubber_cols = ViscosityConstants.rubber_cols
# Replace '-' with 0 for numerical and float columns
viscosity_df[rubber_cols] = viscosity_df[rubber_cols].replace('-', 0)
viscosity_df[rubber_cols] = viscosity_df[rubber_cols].apply(pd.to_numeric, errors='coerce')
# Identify numerical and float columns
numerical_cols = viscosity_df.columns[
viscosity_df.dtypes.apply(lambda x: pd.api.types.is_numeric_dtype(x) or pd.api.types.is_float_dtype(x))]
integer_cols = viscosity_df.columns[viscosity_df.dtypes == 'int64']
# Convert integer columns to float
viscosity_df[integer_cols] = viscosity_df[integer_cols].astype(float)
# Calculate weights
viscosity_df['Weight_type1'] = round(viscosity_df['Quantity using type1 bale'] / (
viscosity_df['Quantity using type1 bale'] + viscosity_df['Quantity using type2 bale']), 2)
viscosity_df['Weight_type2'] = round(viscosity_df['Quantity using type2 bale'] / (
viscosity_df['Quantity using type1 bale'] + viscosity_df['Quantity using type2 bale']), 2)
viscosity_df['Weighted_PO_type'] = (
viscosity_df['PO_type1'] * viscosity_df['Weight_type1'] + viscosity_df[f'PO_type2'] * viscosity_df[
'Weight_type2'])
viscosity_df['Weighted_DIRT_type'] = (
viscosity_df['DIRT_type1'] * viscosity_df['Weight_type1'] + viscosity_df['DIRT_type1.1'] * viscosity_df[
'Weight_type2'])
viscosity_df['Weighted_ASH_type'] = (
viscosity_df['ASH_type1'] * viscosity_df['Weight_type1'] + viscosity_df['ASH_type2'] * viscosity_df[
'Weight_type2'])
viscosity_df['Weighted_VM_type'] = (
viscosity_df['VM_type1'] * viscosity_df['Weight_type1'] + viscosity_df['VM_type2'] * viscosity_df[
'Weight_type2'])
viscosity_df['Weighted_PRI_type'] = (
viscosity_df['PRI_type1'] * viscosity_df['Weight_type1'] + viscosity_df['PRI_type2'] * viscosity_df[
'Weight_type2'])
viscosity_df['Weighted_NITROGEN_type'] = (
viscosity_df['NITROGEN_type1'] * viscosity_df['Weight_type1'] + viscosity_df['NITROGEN_type2'] *
viscosity_df['Weight_type2'])
viscosity_df['Weighted_Temperature during transportation_type[℃]'] = (
viscosity_df['Temperature during transportation_type1[℃]'] * viscosity_df['Weight_type1'] +
viscosity_df['Temperature during transportation_type2[℃]'] * viscosity_df['Weight_type2'])
viscosity_df['Weighted_Humidity during transportation__type[%]'] = (
viscosity_df['Humidity during transportation_type1[%]'] * viscosity_df['Weight_type1'] +
viscosity_df['Humidity during transportation__type2[%]'] * viscosity_df['Weight_type2'])
viscosity_df['Weighted Sum'] = viscosity_df['Weighted_PO_type'] + viscosity_df['Weighted_DIRT_type'] + viscosity_df[
'Weighted_ASH_type'] + viscosity_df['Weighted_VM_type'] + viscosity_df['Weighted_PRI_type'] + viscosity_df[
'Weighted_NITROGEN_type']
column_to_keep_at_end = 'viscosity'
# Reorder columns
new_order = [col for col in viscosity_df.columns if col != column_to_keep_at_end] + [column_to_keep_at_end]
viscosity_df = viscosity_df[new_order]
viscosity_df['batch-date'] = 'Batch_' + viscosity_df['Batch No.'].astype(str) + '_' + viscosity_df['date'].astype(
str)
req_cols = ViscosityConstants.req_cols
final_viscosity_df = viscosity_df[req_cols]
final_viscosity_df = round(final_viscosity_df, 6)
return final_viscosity_df, raw_viscosity_df
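# Illustrative usage sketch: the workbook path, index number and skip-row count
# below are assumptions, not values taken from the project configuration.
if __name__ == "__main__":
    demo_final_df, demo_raw_df = preprocess_viscosity_section(
        path="reports/viscosity_report.xlsx",  # assumed path
        index_number=1,                        # assumed index number
        viscosity_skip_rows=2,                 # assumed header offset
    )
    print(demo_final_df.shape, demo_raw_df.shape)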
import warnings
import numpy as np
import pandas as pd
from loguru import logger
from scripts.constants.constants import MixerConstants
warnings.filterwarnings("ignore")
def preprocess(df):
logger.info("Starting Preprocessing the Data")
# Replace 'nan' with NaN
df = df.replace('nan', np.nan)
# Calculate the number of missing values in each column
missing_counts = df.isnull().sum()
# Get the column names where the number of missing values is equal to the number of rows
cols_to_remove = missing_counts[missing_counts == len(df)].index
df = df.drop(cols_to_remove, axis=1)
df = df.loc[df['Mixing batch number'] != 0]
# Drop rows where 'Batch Number' is NaN
df = df.dropna(subset=['Mixing batch number'])
# Identify constant columns
constant_columns = df.columns[df.nunique() == 1]
# Drop constant columns
df.drop(columns=constant_columns, inplace=True)
logger.info(f"Preprocessing completed and the final shape is {df.shape}")
columns_with_missing_values = df.columns[df.isnull().sum() > 0].tolist()
return df
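# Toy example of what preprocess() removes (made-up frame, not project data):
# a fully-missing column and a constant column are dropped, and rows whose
# 'Mixing batch number' is 0 or NaN are discarded.
#   preprocess(pd.DataFrame({'Mixing batch number': [1, 2, 0],
#                            'all_nan': [np.nan] * 3,
#                            'constant': [5, 5, 5],
#                            'signal': [0.1, 0.2, 0.3]}))
#   -> columns ['Mixing batch number', 'signal'], rows for batches 1 and 2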
def preprocess_mixer_section(df, index_number):
mixer_cols = MixerConstants.mixer_cols
mixer_df = df[mixer_cols]
mixer_df['Time Stamp'] = pd.to_datetime(mixer_df['Time Stamp'])
mixer_df = mixer_df.sort_values(by='Time Stamp')
numeric_cols = mixer_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
mixer_df[numeric_cols] = mixer_df[numeric_cols].astype(float)
mixer_df['day'] = mixer_df['Time Stamp'].dt.date
mixer_df = mixer_df[mixer_df["Size No (INDEX No).3"] == index_number]
mixer_df = mixer_df[mixer_df["Mixing batch number"] != 0]
mixer_df['Mixing Weight (Integrated Value)_diff'] = mixer_df.groupby(['day', 'Mixing batch number'])[
'Mixing Weight (Integrated Value)'].transform(lambda x: x.max() - x.min())
mixer_cleaned_df = preprocess(mixer_df)
mixer_cleaned_df["day"] = mixer_cleaned_df['Time Stamp'].dt.date
mixer_cleaned_df['mixer_on_or_off'] = mixer_cleaned_df['Mixing timer value'].apply(lambda x: 0 if x == 0 else 1)
mixer_cleaned_df['batch-date'] = 'Batch_' + mixer_cleaned_df['Mixing batch number'].astype(str) + '_' + \
mixer_cleaned_df['day'].astype(str)
mixer_cleaned_df = mixer_cleaned_df.sort_values(by='Time Stamp')
# Group by 'batch-date' and add a new column 'rubber_addition'
mixer_cleaned_df['rubber_addition'] = 0
def apply_conditions(group):
max_value_index = group['Mixing timer value'].idxmax()
group.loc[group['Mixing timer value'] != group['Mixing timer value'].max(), 'rubber_addition'] = 1
group.loc[max_value_index, 'rubber_addition'] = 1
return group
mixer_cleaned_df = mixer_cleaned_df.groupby('batch-date').apply(apply_conditions)
# Add 'process_on_or_off' column based on conditions
mixer_cleaned_df['process_on_or_off'] = 0
mixer_cleaned_df.loc[(mixer_cleaned_df['mixer_on_or_off'] == 1) & (
mixer_cleaned_df['rubber_addition'] == 1), 'process_on_or_off'] = 1
numeric_cols = mixer_cleaned_df.select_dtypes(include=['number', 'float']).columns
process_on_df = mixer_cleaned_df[mixer_cleaned_df['process_on_or_off'] == 1]
df_full = process_on_df.copy()
# Define a dictionary for data type conversions
conversion_dict = {col: float for col in df_full.select_dtypes(include='number').columns}
# Apply the data type conversions
df_full = df_full.astype(conversion_dict)
rpm_count = df_full[df_full['Rotor actual rpm'] == 60.0].groupby('batch-date')['Rotor actual rpm'].count()
df_full = df_full.merge(rpm_count, left_on='batch-date', right_index=True, suffixes=('', '_count'))
df_full.rename(columns={'Rotor actual rpm_count': 'max_rpm_count'}, inplace=True)
aggregation_dict = MixerConstants.aggregation_dict
group_by = ['day', 'Mixing batch number']
df_mixer_grouped = df_full.groupby(group_by).agg(aggregation_dict).reset_index()
col_renamer = {}
for col, col_agg in aggregation_dict.items():
if col not in ['viscosity', 'time_min', 'time_max', 'Mixing Weight (Integrated Value)_diff', 'max_rpm_count']:
renamed_col = f'{col.replace("(", "").replace(")", "").replace(" ", "_")}_{col_agg}'.lower()
col_renamer[col] = renamed_col
else:
col_renamer[col] = col
df_mixer_grouped = df_mixer_grouped.rename(columns=col_renamer)
df_mixer_grouped['batch-date'] = 'Batch_' + df_mixer_grouped['Mixing batch number'].astype(str) + '_' + \
df_mixer_grouped['day'].astype(str)
df_mixer_grouped = round(df_mixer_grouped, 6)
return df_mixer_grouped
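# Illustrative usage sketch; the CSV source and index number are assumptions,
# the real pipeline passes the already-loaded machine-log dataframe.
if __name__ == "__main__":
    demo_raw_df = pd.read_csv("machine_log.csv", parse_dates=["Time Stamp"])  # assumed source
    demo_mixer_grouped = preprocess_mixer_section(demo_raw_df, index_number=1)
    print(demo_mixer_grouped.head())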
import os
import re
import mlflow
from loguru import logger
REQUIRED_TZ = "Asia/Kolkata"
mlflow_tracking_uri = 'https://qa.unifytwin.com/mlflow/'
AZURE_STORAGE_CONNECTION_STRING = 'DefaultEndpointsProtocol=https;AccountName=azrmlilensqa006382180551;AccountKey=tD' \
'GOKfiZ2svfoMvVmS0Fbpf0FTHfTq4wKYuDX7cAxlhve/3991QuzdvJHm9vWc+lo6mtC+x9yPSghWNR' \
'4+gacg==;EndpointSuffix=core.windows.net'
AZURE_STORAGE_ACCESS_KEY = 'tDGOKfiZ2svfoMvVmS0Fbpf0FTHfTq4wKYuDX7cAxlhve/3991QuzdvJHm9vWc+lo6mtC+x9yPSghWNR4+gacg=='
os.environ["MLFLOW_TRACKING_USERNAME"] = 'mlflow'
os.environ["MLFLOW_TRACKING_PASSWORD"] = 'MlFlOwQA#4321'
os.environ["AZURE_STORAGE_CONNECTION_STRING"] = AZURE_STORAGE_CONNECTION_STRING
os.environ["AZURE_STORAGE_ACCESS_KEY"] = AZURE_STORAGE_ACCESS_KEY
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_registry_uri(mlflow_tracking_uri)
client = mlflow.tracking.MlflowClient()
class MlFlowUtil:
@staticmethod
def log_model(model, model_name):
try:
mlflow.sklearn.log_model(model, model_name)
logger.info("logged the model")
return True
except Exception as e:
logger.exception(str(e))
@staticmethod
def log_metrics(metrics):
try:
updated_metric = {}
for key, value in metrics.items():
key = re.sub(r"[([{})\]]", "", key)
updated_metric[key] = value
mlflow.log_metrics(updated_metric)
return True
except Exception as e:
logger.exception(str(e))
@staticmethod
def log_hyper_param(hyper_params):
try:
mlflow.log_params(hyper_params)
return True
except Exception as e:
logger.exception(str(e))
@staticmethod
def set_tag(child_run_id, key, value):
try:
client.set_tag(run_id=child_run_id, key=key, value=value)
except Exception as e:
logger.exception(f"Exception while setting the tag - {e}")
class ModelLoaderSaver:
def __init__(self, model, metrics, hyperparams, experiment_name, parent_run_name, model_save_name, model_type):
self.model = model
self.metrics = metrics
self.hyperparams = hyperparams
self.experiment_name = experiment_name
self.parent_run_name = parent_run_name
self.model_save_name = model_save_name
self.model_type = model_type
self._mfu_ = MlFlowUtil()
def check_create_experiment(self):
"""
Check if the experiment exists; if not, create a new one.
:return: experiment_id of the experiment
"""
experiment_info = mlflow.get_experiment_by_name(self.experiment_name)
if experiment_info is None:
logger.info(f"No experiment found with name {self.experiment_name}, So creating one")
mlflow.create_experiment(self.experiment_name)
else:
logger.info(f"Proceeding with existing Experiment {self.experiment_name}")
mlflow.set_experiment(experiment_name=self.experiment_name)
experiment_info = mlflow.get_experiment_by_name(self.experiment_name)
experiment_id = experiment_info.experiment_id
return experiment_id
def check_create_parent_run(self, experiment_id):
"""
Check if a parent run exists in the experiment; if not, create it with the given parent run name.
:param experiment_id: Experiment id
:return: returns the parent run id
"""
parent_runs_df = mlflow.search_runs(experiment_id)
run_key = 'tags.mlflow.runName'
if run_key in parent_runs_df.columns:
parent_runs_df = parent_runs_df[parent_runs_df[run_key] == self.parent_run_name]
else:
parent_runs_df = parent_runs_df.iloc[:0]
if not parent_runs_df.empty:
logger.info(f"Proceeding with existing Parent Run {self.parent_run_name}")
return list(parent_runs_df['run_id'])[0]
# no parent run found
logger.info(f"No Parent Run present {self.parent_run_name}")
with mlflow.start_run(experiment_id=experiment_id, run_name=self.parent_run_name) as run:
logger.info(f"Creating the parent Run {self.parent_run_name} with Parent Run Id {run.info.run_id}")
return run.info.run_id
def check_create_child_run(self, experiment_id, parent_run_id):
"""
Check whether a child run already exists under the given parent run of the experiment.
If one exists, take the child run that has a model saved and decide, based on when it was last
trained, whether retraining is needed; if retraining is needed, create a new child run.
If no child run exists under the parent run, create a new one.
:param experiment_id: experiment id
:param parent_run_id: parent run id
:return: child run id, retrain flag
"""
child_runs_df = mlflow.search_runs(experiment_id, filter_string=f"tags.mlflow.parentRunId='{parent_run_id}'")
if not child_runs_df.empty:
logger.info(f"Already Child runs are present for Parent Run Id {parent_run_id}")
child_run_id, retrain = self.get_latest_child_run(experiment_id, parent_run_id, child_runs_df)
return child_run_id, retrain
else:
logger.info(f"Child runs are not present for Parent Run Id {parent_run_id}")
with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
return child_run.info.run_id, True
def get_latest_child_run(self, experiment_id, parent_run_id, runs_df):
"""
Check whether any child run contains a saved model; if not, create a new child run. Otherwise,
validate the last run time and either create a new child run (if retraining is needed) or return
the latest child run id that has a model saved.
:param experiment_id: experiment id
:param parent_run_id: parent run id
:param runs_df: the child runs of the parent id
:return: last child run id, retrain flag
"""
history_key = 'tags.mlflow.log-model.history'
if history_key in runs_df.columns:
runs_df = runs_df[runs_df[history_key].notna()]
else:
runs_df = runs_df.iloc[:0]
if runs_df.empty:
logger.info("Existing Child Runs doesn't contain any model to run. So creating new child run")
with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
return child_run.info.run_id, True
latest_child_run_id = list(runs_df['run_id'])[0]
latest_run_info = runs_df.iloc[:1]
retrain = False
# day_check_flag = self.check_existing_model_retrain(latest_child_run_id, latest_run_info, retrain)
# if day_check_flag:
if self.model is not None:
with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
with mlflow.start_run(experiment_id=experiment_id, nested=True) as child_run:
return child_run.info.run_id, True
return latest_child_run_id, retrain
@staticmethod
def load_model_pyfunc(model_path):
"""
Function to load the model from mlflow artifact path
:param model_path: model path on mlflow
:return: loaded model
"""
try:
model = mlflow.pyfunc.load_model(model_path)
logger.info("loading the model")
return model
except Exception as e:
logger.exception(str(e))
def check_existing_model_retrain(self, latest_child_run_id, child_run_info, retrain):
"""
If retrain is True, it returns true as retraining is required.
If retrain is False, it checks the time difference between the last child run and the current time and returns
true or false depending on the time difference
:param latest_child_run_id: last child run id
:param child_run_info: last child run info
:param retrain: retrain flag
:return: final retrain flag
"""
if retrain:
logger.info("Retraining Needed...")
return True
else:
# uncomment this if auto train should happen
logger.info(f"Already trained model is present, checking the age of the existing model of run id "
f"{latest_child_run_id}")
# time_diff = self._mfu_.get_last_run_time_diff(child_run_info, self.retrain_param_unit)
# logger.info(f"Time difference is {time_diff} {self.retrain_param_unit}")
# if time_diff >= self.retrain_param_value:
# logger.info(f"Retraining needed as the last trained model time exceeds the mentioned time difference "
# f"{self.retrain_param_value} {self.retrain_param_unit}")
# return True
# else:
# logger.info(f"Retraining not needed as the last trained model time doesnt exceeds the mentioned time "
# f"difference {self.retrain_param_value} {self.retrain_param_unit}")
# return False
return True
def forming_loading_path(self, latest_run_id):
"""
Creates the path from the child run id
:param latest_run_id: latest child run id
:return: the path to the model
"""
try:
model_name = self.model_save_name
return f"runs:/{latest_run_id}/{model_name}"
except Exception as e:
logger.exception(f"Exception while forming loading path - {e}")
def model_trainer(self, experiment_id, parent_run_id, child_run_id):
"""
Using the experiment id, parent run id and child run id, it will train the model
:param experiment_id: experiment id
:param parent_run_id: parent run id
:param child_run_id: child run id
:return: the final model
"""
with mlflow.start_run(experiment_id=experiment_id, run_id=parent_run_id, nested=True):
with mlflow.start_run(experiment_id=experiment_id, run_id=child_run_id, nested=True):
# metrics, hyperparams, model_name, model = self._pycaret_obj_.perform_task()
# logger.info(f"Model is {model_name}")
logger.info(f"Metrics: {self.metrics}")
logger.info(f"Hyperparams: {self.hyperparams}")
self._mfu_.log_model(model=self.model, model_name=self.model_save_name)
model_name = 'ExtraTreeRegressor'
self._mfu_.log_metrics(metrics=self.metrics)
self._mfu_.log_hyper_param(hyper_params=self.hyperparams)
self._mfu_.set_tag(child_run_id=child_run_id, key="algorithm", value=model_name)
return self.model
def get_latest_model(self):
"""
This is the Main function which will return the latest model
:return:
"""
experiment_id = self.check_create_experiment()
parent_run_id = self.check_create_parent_run(experiment_id)
child_run_id, retrain = self.check_create_child_run(experiment_id, parent_run_id)
logger.info(f"Retrain flag is {retrain}")
if self.model is not None:
logger.info("Retraining needed")
self.model_trainer(experiment_id, parent_run_id, child_run_id)
logger.info("New model trained successfully")
else:
logger.info(f"No retraining needed. proceeding to load the last child run model {child_run_id}")
logger.info(f"Loading the model from the child run id {child_run_id}")
final_model = self.load_model_pyfunc(model_path=self.forming_loading_path(latest_run_id=child_run_id))
return final_model
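# Illustrative usage sketch under assumed experiment and run names; the model,
# metrics and hyperparameters below are placeholders, and running it requires
# the tracking server configured at the top of this module to be reachable.
if __name__ == "__main__":
    from sklearn.ensemble import ExtraTreesRegressor
    demo_saver = ModelLoaderSaver(
        model=ExtraTreesRegressor(n_estimators=50),
        metrics={"rmse": 0.42, "r2": 0.87},     # placeholder metrics
        hyperparams={"n_estimators": 50},       # placeholder hyperparameters
        experiment_name="demo_experiment",      # assumed experiment name
        parent_run_name="demo_parent_run",      # assumed parent run name
        model_save_name="demo_model",           # assumed artifact name
        model_type="regressor",
    )
    demo_model = demo_saver.get_latest_model()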
import math
import warnings
import traceback
from loguru import logger
from datetime import datetime
import numpy as np
import pandas as pd
from scripts.constants.constants import PickupConstants
warnings.filterwarnings("ignore")
def get_mixer_batch_date(raw_df, index_number):
try:
logger.info('Getting mixer batch date dictionary')
mixer_df = raw_df[PickupConstants.pick_imp_mixer_cols]
mixer_df = mixer_df.sort_values(by='Time Stamp')
numeric_cols = mixer_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
mixer_df[numeric_cols] = mixer_df[numeric_cols].astype(float)
mixer_df['day'] = mixer_df['Time Stamp'].dt.date
mixer_df = mixer_df[mixer_df["Size No (INDEX No).3"] == index_number]
mixer_df = mixer_df[mixer_df["Mixing batch number"] != 0]
mixer_df['time_min'] = mixer_df['Time Stamp']
mixer_df['time_max'] = mixer_df['Time Stamp']
aggregation_dict = {
'time_min': 'min',
'time_max': 'max',
}
group_by = ['day', 'Mixing batch number']
df_mixer_grouped = mixer_df.groupby(group_by).agg(aggregation_dict).reset_index()
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped['time_max'] - df_mixer_grouped['time_min']
df_mixer_grouped['mixer_section_time_diff_second'] = df_mixer_grouped[
'mixer_section_time_diff_second'].dt.total_seconds()
df_mixer_grouped['batch-date'] = 'Batch_' + df_mixer_grouped['Mixing batch number'].astype(str) + '_' + \
df_mixer_grouped['day'].astype(str)
date_dict = {}
batch_lis = list(df_mixer_grouped['batch-date'].unique())
for each_bt in batch_lis:
df_nw = df_mixer_grouped[df_mixer_grouped['batch-date'] == each_bt]
date_dict[each_bt] = {"start_time": str(list(df_nw['time_min'])[0]),
'end_time': str(list(df_nw['time_max'])[0])}
return date_dict
except Exception as err:
logger.error(f'Error while getting mixer time and forming date dict: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
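# The returned date_dict maps each 'Batch_<number>_<day>' key to the first and
# last mixer timestamps seen for that batch, for example (made-up values):
#   {'Batch_12.0_2023-08-01': {'start_time': '2023-08-01 06:02:11',
#                              'end_time': '2023-08-01 06:14:53'}}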
def return_batch_no_bof_df(raw_df, viscosity_df, date_dict, index_number):
try:
logger.info('Getting bof batch number')
raw_df['day'] = raw_df['Time Stamp'].dt.date
raw_df['day'] = raw_df['day'].astype('str')
raw_df['Mixing batch number'] = raw_df['Mixing batch number'].astype('float')
raw_df['batch-date'] = 'Batch_' + raw_df['Mixing batch number'].astype(
'str') + '_' + raw_df['day'].astype('str')
bof_add_cols = ['Size No (INDEX No).5', 'length passed through', 'Time Stamp', 'day', 'lower door open']
bof_df = raw_df[bof_add_cols]
sorted_bof_df = bof_df.sort_values(by="Time Stamp", ascending=True)
sorted_bof_df = sorted_bof_df[sorted_bof_df['Size No (INDEX No).5'] == index_number]
dt_list = list(sorted_bof_df['day'].unique())
day_length_dic = {}
for each_day in dt_list:
day_df = sorted_bof_df[sorted_bof_df['day'] == each_day]
if (day_df['length passed through'].max() - day_df['length passed through'].min()) <= 0:
value = 0
else:
value = day_df['length passed through'].max() - day_df['length passed through'].min()
day_length_dic[each_day] = value
sorted_viscosity_df = viscosity_df.sort_values(by="Mixing date", ascending=True)
sorted_viscosity_df['day'] = sorted_viscosity_df['Mixing date'].dt.date
sorted_viscosity_df['day'] = sorted_viscosity_df['day'].astype('str')
extrud_visc_df = sorted_viscosity_df[['Batch No.', 'Input rubber weight(0.1kg)', 'day', 'Mixing date']]
extrud_visc_df['length_from_extruder'] = extrud_visc_df['day'].map(day_length_dic)
extrud_visc_df['length_from_extruder'] = extrud_visc_df['length_from_extruder'].fillna(0)
daily_sum_weight = extrud_visc_df.groupby('day')['Input rubber weight(0.1kg)'].sum() / 10
# Add a new column 'm/kg' by dividing 'length_from_extruder' by the sum for each day
extrud_visc_df['m/kg'] = extrud_visc_df.apply(
lambda row: row['length_from_extruder'] / daily_sum_weight[row['day']], axis=1)
extrud_visc_df['batch_length'] = extrud_visc_df.apply(
lambda row: row['m/kg'] * row['Input rubber weight(0.1kg)'] / 10, axis=1).astype('float64')
extrud_visc_df['batch_length'] = extrud_visc_df['batch_length'].apply(math.ceil)
extrud_visc_df['cumulative_length'] = extrud_visc_df.groupby('day')['batch_length'].cumsum()
discharge_dict = extrud_visc_df.groupby('day').apply(
lambda group: group.set_index('Batch No.').to_dict()['cumulative_length']).to_dict()
test_df = sorted_bof_df
# Initialize an empty list to store batch numbers
batch_numbers = []
# Iterate through each row in the DataFrame
for index, row in test_df.iterrows():
day = row['day']
discharge_length = row['length passed through']
if discharge_length == 0:
batch_numbers.append(0)
else:
# Check if the day is in the dictionary
if day in discharge_dict:
# Check if discharge length is less than or equal to the corresponding batch length
batch_length_dict = discharge_dict[day]
for batch_no, batch_length in batch_length_dict.items():
if discharge_length <= batch_length:
batch_numbers.append(batch_no)
break
else:
# If the length exceeds every cumulative batch length, carry forward the previous batch number
batch_numbers.append(batch_numbers[-1])
else:
# If day is not in the dictionary, assign NaN to batch number
batch_numbers.append(np.nan)
# Add the 'batch_no' column to the DataFrame
test_df['batch_no'] = batch_numbers
batch_number = 0
batch_list = []
started_with_one = False
current_day = None
for value, day in zip(list(test_df['lower door open']), list(test_df['day'])):
if current_day != day:
current_day = day
batch_number = 0
if value == 1:
if not started_with_one:
batch_number += 1
started_with_one = True
batch_list.append(batch_number)
else:
batch_list.append(batch_number)
started_with_one = False
batch_number = 0
batch_list = []
started_with_one = False
for value in test_df['lower door open']:
if value == 1:
if not started_with_one:
batch_number += 1
started_with_one = True
batch_list.append(batch_number)
else:
batch_list.append(batch_number)
started_with_one = False
test_df['batch_no'] = test_df['batch_no'].astype('float')
test_df['bof_batch_date'] = 'Batch_' + test_df['batch_no'].astype('str') + '_' + test_df['day'].astype('str')
extruder_flag_list = []
extrud_flg_vms = []
for i, value in test_df.iterrows():
if value['batch_no'] == 0.0:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
else:
# start_time = np.datetime64(date_dict.get(value['bof_batch_date']).get('start_time'))
# end_time = np.datetime64(date_dict.get(value['bof_batch_date']).get('end_time'))
start_time = date_dict.get(value["bof_batch_date"]).get("start_time")
end_time = date_dict.get(value["bof_batch_date"]).get("end_time")
if (datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') > datetime.strptime(
start_time.split('+')[0], '%Y-%m-%d %H:%M:%S')) & \
(datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') < datetime.strptime(
end_time.split('+')[0], '%Y-%m-%d %H:%M:%S')):
extruder_flag_list.append('true')
extrud_flg_vms.append(1)
else:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
test_df['bof_flag'] = extruder_flag_list
test_df['bof_batch_diff'] = extrud_flg_vms
test_df['updated_bt_list'] = batch_list
test_df['bof_batch_number'] = test_df['batch_no'] - test_df['bof_batch_diff'].astype('float')
test_df = test_df.rename(columns={'bof_batch_date': 'batch-date'})
return test_df
except Exception as err:
logger.error(f'Error while adding batch to bof section: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
def get_bof_batch_date(bof_batch_df, index_number):
try:
logger.info('Getting bof batch date dictionary')
bof_cols = PickupConstants.pick_imp_bof_cols
bof_df = bof_batch_df[bof_cols]
bof_df = bof_df.sort_values(by='Time Stamp')
numeric_cols = bof_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
bof_df[numeric_cols] = bof_df[numeric_cols].astype(float)
bof_df['day'] = bof_df['Time Stamp'].dt.date
bof_df = bof_df[bof_df["Size No (INDEX No).5"] == index_number]
bof_df = bof_df[bof_df["bof_batch_number"] != 0]
bof_df['time_min'] = bof_df['Time Stamp']
bof_df['time_max'] = bof_df['Time Stamp']
aggregation_dict = {
'time_min': 'min',
'time_max': 'max',
}
group_by = ['day', 'bof_batch_number']
df_bof_grouped = bof_df.groupby(group_by).agg(aggregation_dict).reset_index()
df_bof_grouped['mixer_section_time_diff_second'] = df_bof_grouped['time_max'] - df_bof_grouped['time_min']
df_bof_grouped['mixer_section_time_diff_second'] = df_bof_grouped[
'mixer_section_time_diff_second'].dt.total_seconds()
df_bof_grouped['batch-date'] = 'Batch_' + df_bof_grouped['bof_batch_number'].astype(str) + '_' + df_bof_grouped[
'day'].astype(str)
bof_date_dict = {}
batch_lis = list(df_bof_grouped['batch-date'].unique())
for each_bt in batch_lis:
df_nw = df_bof_grouped[df_bof_grouped['batch-date'] == each_bt]
bof_date_dict[each_bt] = {"start_time": str(list(df_nw['time_min'])[0]),
'end_time': str(list(df_nw['time_max'])[0])}
return bof_date_dict
except Exception as err:
logger.error(f'Error while getting bof time and forming bof date dict: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
def return_pick_batch_no_df(raw_df, viscosity_df, bof_date_dict, bof_batch_num_df, index_number):
try:
logger.info('Getting pickup batch date dataframe')
raw_df['day'] = raw_df['Time Stamp'].dt.date
raw_df['day'] = raw_df['day'].astype('str')
raw_df['Mixing batch number'] = raw_df['Mixing batch number'].astype('float')
raw_df['batch-date'] = 'Batch_' + raw_df['Mixing batch number'].astype('str') + '_' + raw_df['day'].astype(
'str')
pick_add_cols = PickupConstants.pick_cols + PickupConstants.pick_additional_cols
pick_df = raw_df[pick_add_cols]
sorted_pick_df = pick_df.sort_values(by="Time Stamp", ascending=True)
sorted_pick_df = sorted_pick_df[sorted_pick_df['Size No (INDEX No).6'] == index_number]
dt_list = list(sorted_pick_df['day'].unique())
day_length_dic = {}
for each_day in dt_list:
day_df = sorted_pick_df[sorted_pick_df['day'] == each_day]
if day_df['length passed through.1'].max() - day_df['length passed through.1'].min() <= 0:
value = 0
else:
value = day_df['length passed through.1'].max() - day_df['length passed through.1'].min()
day_length_dic[each_day] = value
sorted_viscosity_df = viscosity_df.sort_values(by="Mixing date", ascending=True)
sorted_viscosity_df['day'] = sorted_viscosity_df['Mixing date'].dt.date
sorted_viscosity_df['day'] = sorted_viscosity_df['day'].astype('str')
extrud_visc_df = sorted_viscosity_df[['Batch No.', 'Input rubber weight(0.1kg)', 'day', 'Mixing date']]
extrud_visc_df['length_from_pickup'] = extrud_visc_df['day'].map(day_length_dic)
extrud_visc_df['length_from_pickup'] = extrud_visc_df['length_from_pickup'].fillna(0)
daily_sum_weight = extrud_visc_df.groupby('day')['Input rubber weight(0.1kg)'].sum() / 10
# Add a new column 'm/kg' by dividing 'length_from_pickup' by the sum for each day
extrud_visc_df['m/kg'] = extrud_visc_df.apply(
lambda row: row['length_from_pickup'] / daily_sum_weight[row['day']], axis=1)
extrud_visc_df['batch_length'] = extrud_visc_df.apply(
lambda row: row['m/kg'] * row['Input rubber weight(0.1kg)'] / 10, axis=1).astype('float64')
extrud_visc_df['batch_length'] = extrud_visc_df['batch_length'].apply(math.ceil)
extrud_visc_df['cumulative_length'] = extrud_visc_df.groupby('day')['batch_length'].cumsum()
discharge_dict = extrud_visc_df.groupby('day').apply(
lambda group: group.set_index('Batch No.').to_dict()['cumulative_length']).to_dict()
test_pick_df = sorted_pick_df
# Initialize an empty list to store batch numbers
batch_numbers = []
# Iterate through each row in the DataFrame
for index, row in test_pick_df.iterrows():
day = row['day']
discharge_length = row['length passed through.1']
if discharge_length == 0:
batch_numbers.append(0)
else:
# Check if the day is in the dictionary
if day in discharge_dict:
# Check if discharge length is less than or equal to the corresponding batch length
batch_length_dict = discharge_dict[day]
for batch_no, batch_length in batch_length_dict.items():
if discharge_length <= batch_length:
batch_numbers.append(batch_no)
break
else:
# If the length exceeds every cumulative batch length, carry forward the previous batch number
batch_numbers.append(batch_numbers[-1])
else:
# If day is not in the dictionary, assign NaN to batch number
batch_numbers.append(np.nan)
# Add the 'batch_no' column to the DataFrame
test_pick_df['batch_no'] = batch_numbers
test_pick_df['batch_no'] = test_pick_df['batch_no'].astype('float')
test_pick_df['pickup_batch_date'] = 'Batch_' + test_pick_df['batch_no'].astype('str') + '_' + test_pick_df[
'day'].astype('str')
test_pick_df['bof_batch_number'] = bof_batch_num_df['bof_batch_number']
extruder_flag_list = []
extrud_flg_vms = []
for i, value in test_pick_df.iterrows():
if value['batch_no'] == 0.0:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
else:
# start_time = np.datetime64(bof_date_dict.get(value['pickup_batch_date']).get('start_time'))
# end_time = np.datetime64(bof_date_dict.get(value['pickup_batch_date']).get('end_time'))
start_time = bof_date_dict.get(value["pickup_batch_date"]).get("start_time")
end_time = bof_date_dict.get(value["pickup_batch_date"]).get("end_time")
if (datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') > datetime.strptime(start_time.split('+')[0], '%Y-%m-%d %H:%M:%S')) & \
(datetime.strptime(str(value["Time Stamp"]).split('+')[0], '%Y-%m-%d %H:%M:%S') < datetime.strptime(end_time.split('+')[0], '%Y-%m-%d %H:%M:%S')):
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
else:
extruder_flag_list.append('false')
extrud_flg_vms.append(0)
test_pick_df['pickup_flag'] = extruder_flag_list
test_pick_df['pickup_batch_diff'] = extrud_flg_vms
test_pick_df['pickup_batch_verify_number'] = test_pick_df['batch_no'] - test_pick_df[
'pickup_batch_diff'].astype('float')
actual_pickup_bt_num = []
for i, value in test_pick_df.iterrows():
pickup_batch_number = value['pickup_batch_verify_number']
bof_batch_num = value['bof_batch_number']
if pickup_batch_number <= bof_batch_num:
actual_pickup_bt_num.append(pickup_batch_number)
else:
actual_pickup_bt_num.append(bof_batch_num)
test_pick_df['pickup_batch_number'] = actual_pickup_bt_num
test_pick_df['batch-date'] = 'Batch_' + test_pick_df['pickup_batch_number'].astype('str') + '_' + \
test_pick_df['day'].astype('str')
return test_pick_df
except Exception as err:
logger.error(f'Error in adding batch to pick section: {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
def preprocess_pickup_section(raw_df, index_number, viscosity_df):
try:
logger.info('Preprocessing and getting aggregated pickup dataframe')
raw_df['Time Stamp'] = pd.to_datetime(raw_df['Time Stamp'])
raw_df['day'] = raw_df['Time Stamp'].dt.date
raw_df['day'] = raw_df['day'].astype('str')
extr_cols = PickupConstants.pick_cols + PickupConstants.pick_additional_cols
pick_df = raw_df[extr_cols]
sorted_pick_df = pick_df.sort_values(by="Time Stamp", ascending=True)
sorted_pick_df = sorted_pick_df[sorted_pick_df['Size No (INDEX No).6'] == index_number]
dt_list = list(sorted_pick_df['day'].unique())
day_length_dic = {}
for each_day in dt_list:
day_df = sorted_pick_df[sorted_pick_df['day'] == each_day]
day_length_dic[each_day] = day_df['length passed through.1'].max() - day_df['length passed through.1'].min()
# The viscosity workbook has already been read upstream; prepare it here
viscosity_df['Mixing date'] = pd.to_datetime(viscosity_df['Mixing date'])
sorted_viscosity_df = viscosity_df.sort_values(by="Mixing date", ascending=True)
sorted_viscosity_df['day'] = sorted_viscosity_df['Mixing date'].dt.date
sorted_viscosity_df['day'] = sorted_viscosity_df['day'].astype('str')
sorted_viscosity_df['batch-date'] = 'Batch_' + viscosity_df['Batch No.'].astype(
'float').astype(str) + '_' + sorted_viscosity_df['day'].astype(str)
sorted_viscosity_df = sorted_viscosity_df[sorted_viscosity_df['Index No'] == index_number]
weight_date_dict = {}
weight_batch_dict = {}
for each_day in dt_list:
day_df = sorted_viscosity_df[sorted_viscosity_df['day'] == each_day]
summed = day_df['Input rubber weight(0.1kg)'].astype('float64').sum()
weight_date_dict[each_day] = summed
weight_batch_dict[each_day] = summed
date_dict = get_mixer_batch_date(raw_df, index_number)
bof_test_df = return_batch_no_bof_df(raw_df, sorted_viscosity_df, date_dict, index_number)
bof_date_dict = get_bof_batch_date(bof_test_df, index_number)
pick_merged_batch_df = return_pick_batch_no_df(raw_df, sorted_viscosity_df, bof_date_dict,
bof_test_df, index_number)
# Merging pick data with viscosity data on date-batch column
pickup_merged_df_final = pd.merge(pick_merged_batch_df, sorted_viscosity_df[['batch-date', 'viscosity']],
on='batch-date', how='left')
# Removing batch 0
pickup_merged_df_final = pickup_merged_df_final[pickup_merged_df_final['pickup_batch_number'] != 0]
# Grouping with aggregated data
df_pickup_grouped = pickup_merged_df_final.groupby(PickupConstants.pick_grouped_cols).agg(
PickupConstants.pick_aggregate_dict).reset_index()
col_renamer = {}
for col, col_agg in PickupConstants.pick_aggregate_dict.items():
if col not in ['viscosity', 'time_min', 'time_max', 'Mixing Weight (Integrated Value)_diff',
'max_rpm_count']:
renamed_col = f'{col.replace("(", "").replace(")", "").replace(" ", "_")}_{col_agg}'.lower()
col_renamer[col] = renamed_col
else:
col_renamer[col] = col
df_pickup_grouped = df_pickup_grouped.rename(columns=col_renamer)
df_pickup_grouped_visc = df_pickup_grouped.drop('viscosity', axis=1)
return df_pickup_grouped_visc
except Exception as err:
logger.error(f'Error while performing main function for pickup section {str(err)}')
logger.error(traceback.format_exc())
raise Exception(str(err))
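# Illustrative usage sketch; both source files and the index number below are
# assumptions, the real caller supplies the loaded machine-log and viscosity data.
if __name__ == "__main__":
    demo_machine_df = pd.read_csv("machine_log.csv", parse_dates=["Time Stamp"])  # assumed source
    demo_visc_df = pd.read_excel("viscosity_report.xlsx", skiprows=2)              # assumed source
    demo_pickup_grouped = preprocess_pickup_section(demo_machine_df, index_number=1,
                                                    viscosity_df=demo_visc_df)
    print(demo_pickup_grouped.head())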
import warnings
import pandas as pd
from scripts.constants.constants import SheetConstants
warnings.filterwarnings("ignore")
def preprocess_sheet_section(df, index_number):
sheet_supply_column = SheetConstants.sheet_supply_column
sheet_supply_df = df[sheet_supply_column]
sheet_supply_df['Time Stamp'] = pd.to_datetime(sheet_supply_df['Time Stamp'])
sheet_supply_df = sheet_supply_df.sort_values(by='Time Stamp')
numeric_cols = sheet_supply_df.select_dtypes(include=['int', 'float']).columns
# Convert numeric columns to float
sheet_supply_df[numeric_cols] = sheet_supply_df[numeric_cols].astype(float)
sheet_supply_df['day'] = sheet_supply_df['Time Stamp'].dt.date
sheet_supply_df['batch-date'] = 'Batch_' + sheet_supply_df['Weighing times'].astype(str) + '_' + sheet_supply_df[
'day'].astype(str)
sheet_supply_df = sheet_supply_df[sheet_supply_df["Size No (INDEX No)"] == index_number]
sheet_supply_df = sheet_supply_df[sheet_supply_df["Weighing times"] != 0]
aggregation_dict = SheetConstants.aggregation_dict
group_by = ['day', 'Weighing times']
df_sheet_grouped = sheet_supply_df.groupby(group_by).agg(aggregation_dict).reset_index()
col_renamer = {}
for col, col_agg in aggregation_dict.items():
if col not in ['viscosity', 'time_min', 'time_max']:
renamed_col = f'{col.replace("(", "").replace(")", "").replace(" ", "_")}_{col_agg}'.lower()
col_renamer[col] = renamed_col
else:
col_renamer[col] = col
df_sheet_grouped = df_sheet_grouped.rename(columns=col_renamer)
df_sheet_grouped['batch-date'] = 'Batch_' + df_sheet_grouped['Weighing times'].astype(str) + '_' + df_sheet_grouped[
'day'].astype(str)
df_sheet_grouped = round(df_sheet_grouped, 6)
return df_sheet_grouped
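# Illustrative usage sketch; the source file and index number are assumptions.
if __name__ == "__main__":
    demo_machine_df = pd.read_csv("machine_log.csv", parse_dates=["Time Stamp"])  # assumed source
    demo_sheet_grouped = preprocess_sheet_section(demo_machine_df, index_number=1)
    print(demo_sheet_grouped.head())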