Spaces:
Runtime error
Runtime error
Commit
·
2fc2c1f
1
Parent(s):
3eb0b43
Upload 19 files
Browse files- supv/__init__.py +0 -0
- supv/bacl.py +493 -0
- supv/basic_nn.py +293 -0
- supv/fftn.py +240 -0
- supv/gbt.py +482 -0
- supv/gcn.py +444 -0
- supv/knn.py +106 -0
- supv/lrd.py +112 -0
- supv/lstm.py +414 -0
- supv/mcalib.py +384 -0
- supv/mcclf.py +207 -0
- supv/nlm.py +434 -0
- supv/optunar.py +127 -0
- supv/pasearch.py +243 -0
- supv/regress.py +253 -0
- supv/rf.py +134 -0
- supv/svm.py +141 -0
- supv/svml.py +428 -0
- supv/tnn.py +789 -0
supv/__init__.py
ADDED
|
File without changes
|
supv/bacl.py
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import matplotlib
|
| 25 |
+
import random
|
| 26 |
+
import jprops
|
| 27 |
+
from io import StringIO
|
| 28 |
+
from sklearn.model_selection import cross_val_score
|
| 29 |
+
import joblib
|
| 30 |
+
from random import randint
|
| 31 |
+
from io import StringIO
|
| 32 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 33 |
+
from util import *
|
| 34 |
+
from mlutil import *
|
| 35 |
+
from pasearch import *
|
| 36 |
+
|
| 37 |
+
#base classifier class
|
| 38 |
+
class BaseClassifier(object):
    """
    Configuration driven base class for classifiers.

    The class orchestrates training, validation, parameter search and prediction by
    reading settings from a Configuration object and calling fit / score / predict on
    self.classifier. train() and trainValidate() call self.buildModel(), which is not
    defined in this class -- presumably every concrete subclass provides it and assigns
    self.classifier there (TODO confirm against the subclasses).
    """

    def __init__(self, configFile, defValues, mname):
        """
        loads the configuration and creates the session logger

        Parameters
            configFile : configuration file, handed to Configuration
            defValues : default configuration values, handed to Configuration
            mname : name handed to createLogger
        """
        self.config = Configuration(configFile, defValues)
        # fraction of the training data used by train(); None means no sub sampling
        self.subSampleRate = None
        # training data cached by train() after the first load
        self.featData = None
        self.clsData = None
        self.classifier = None
        self.trained = False
        self.verbose = self.config.getBooleanConfig("common.verbose")[0]
        logFilePath = self.config.getStringConfig("common.logging.file")[0]
        logLevName = self.config.getStringConfig("common.logging.level")[0]
        self.logger = createLogger(mname, logFilePath, logLevName)
        self.logger.info("********* starting session")

    def initConfig(self, configFile, defValues):
        """
        replaces the current configuration with a freshly loaded one
        """
        self.config = Configuration(configFile, defValues)

    def getConfig(self):
        """
        returns the configuration object
        """
        return self.config

    def setConfigParam(self, name, value):
        """
        sets one configuration parameter
        """
        self.config.setParam(name, value)

    def getMode(self):
        """
        returns the configured value of common.mode
        """
        return self.config.getStringConfig("common.mode")[0]

    def getSearchParamStrategy(self):
        """
        returns the configured parameter search strategy name
        """
        return self.config.getStringConfig("train.search.param.strategy")[0]

    def train(self):
        """
        builds the model, fits it and scores it on the same data it was fitted with

        Training data is loaded on the first call and cached; when self.subSampleRate
        is set the cached data is sub sampled before fitting. The model is saved with
        joblib when train.model.save is true.

        Returns
            the training score, or 1 - score, depending on train.success.criterion

        Raises
            ValueError : train.success.criterion is neither "accuracy" nor "error"
        """
        self.buildModel()

        # training data, loaded once and reused by later calls
        if self.featData is None:
            (featData, clsData) = self.prepTrainingData()
            (self.featData, self.clsData) = (featData, clsData)
        else:
            (featData, clsData) = (self.featData, self.clsData)
        if self.subSampleRate is not None:
            (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
            self.logger.info("subsample size " + str(featData.shape[0]))

        modelSave = self.config.getBooleanConfig("train.model.save")[0]

        self.logger.info("...training model")
        self.classifier.fit(featData, clsData)
        score = self.classifier.score(featData, clsData)
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        if successCriterion == "accuracy":
            self.logger.info("accuracy with training data {:06.3f}".format(score))
            result = score
        elif successCriterion == "error":
            error = 1.0 - score
            self.logger.info("error with training data {:06.3f}".format(error))
            result = error
        else:
            raise ValueError("invalid success criterion")

        if modelSave:
            self.logger.info("...saving model")
            modelFilePath = self.getModelFilePath()
            joblib.dump(self.classifier, modelFilePath)
        self.trained = True
        return result

    def trainValidate(self):
        """
        builds the model and runs k fold cross validation on the training data

        Returns
            the mean cross validation score, or 1 - mean score, as decided by
            reportResult from train.success.criterion
        """
        self.buildModel()

        # training data is reloaded on every call, the cache used by train() is not consulted
        (featData, clsData) = self.prepTrainingData()

        # read as in the original code although the value is not used below
        validation = self.config.getStringConfig("train.validation")[0]
        numFolds = self.config.getIntConfig("train.num.folds")[0]
        successCriterion = self.config.getStringConfig("train.success.criterion")[0]
        scoreMethod = self.config.getStringConfig("train.score.method")[0]

        self.logger.info("...training and kfold cross validating model")
        scores = cross_val_score(self.classifier, featData, clsData, cv=numFolds,scoring=scoreMethod)
        avScore = np.mean(scores)
        result = self.reportResult(avScore, successCriterion, scoreMethod)
        return result

    def trainValidateSearch(self):
        """
        searches the parameter space, running k fold cross validation for every candidate

        The strategy comes from train.search.param.strategy ("grid", "random" or
        "simuan"). train.search.params is a comma separated list of items; each item is
        split on ":" and its first element is a config parameter name. The candidate
        values of a parameter are read from the config entry with that name, while the
        name with its second dot separated component removed is the name that is
        registered with the search strategy and later set in the config.

        Returns
            the best solution reported by the search strategy; the code below reads it
            as (list of (name, value) pairs, score)

        Raises
            ValueError : strategy missing or unknown, or search parameter list missing
        """
        self.logger.info("...starting train validate with parameter search")
        searchStrategyName = self.getSearchParamStrategy()
        if searchStrategyName is None:
            raise ValueError("missing search strategy")

        if searchStrategyName == "grid":
            searchStrategy = GuidedParameterSearch(self.verbose)
        elif searchStrategyName == "random":
            searchStrategy = RandomParameterSearch(self.verbose)
            maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
            searchStrategy.setMaxIter(maxIter)
        elif searchStrategyName == "simuan":
            searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
            maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
            searchStrategy.setMaxIter(maxIter)
            temp = self.config.getFloatConfig("train.search.sa.temp")[0]
            searchStrategy.setTemp(temp)
            tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
            searchStrategy.setTempReductionRate(tempRedRate)
        else:
            raise ValueError("invalid paramtere search strategy")

        # add search params; the None check has to happen before the split, otherwise a
        # missing entry fails with AttributeError instead of the intended ValueError
        searchParamsConfig = self.config.getStringConfig("train.search.params")[0]
        if searchParamsConfig is None:
            raise ValueError("missing search parameter list")
        searchParamNames = []
        extSearchParamNames = []
        for searchParam in searchParamsConfig.split(","):
            paramItems = searchParam.split(":")
            extSearchParamNames.append(paramItems[0])

            # drop the second component of the dotted parameter name
            paramNameItems = paramItems[0].split(".")
            del paramNameItems[1]
            paramItems[0] = ".".join(paramNameItems)

            searchStrategy.addParam(paramItems)
            searchParamNames.append(paramItems[0])

        # add search param data list for each param
        for (searchParamName, extSearchParamName) in zip(searchParamNames, extSearchParamNames):
            searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
            searchStrategy.addParamVaues(searchParamName, searchParamData)

        # train and validate for various param value combination
        searchStrategy.prepare()
        paramValues = searchStrategy.nextParamValues()
        searchResults = []
        while paramValues is not None:
            self.logger.info("...next parameter set")
            paramStr = ""
            for paramValue in paramValues:
                self.setConfigParam(paramValue[0], str(paramValue[1]))
                paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
            result = self.trainValidate()
            searchStrategy.setCost(result)
            searchResults.append((paramStr, result))
            paramValues = searchStrategy.nextParamValues()

        # output
        self.logger.info("all parameter search results")
        for searchResult in searchResults:
            # the format spec needs the leading colon, "{06.3f}" is not a valid field
            self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))

        self.logger.info("best parameter search result")
        bestSolution = searchStrategy.getBestSolution()
        paramStr = ""
        for paramValue in bestSolution[0]:
            paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
        self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
        return bestSolution

    def validate(self):
        """
        predicts on the validation data and logs the configured metric

        Uses the saved model when validate.use.saved.model is true, otherwise trains
        first unless the model is already trained. validate.score.method selects
        "accuracy" or "confusionMatrix"; any other value logs no metric.
        """
        useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
        if useSavedModel:
            # load saved model
            self.logger.info("...loading model")
            modelFilePath = self.getModelFilePath()
            self.classifier = joblib.load(modelFilePath)
        else:
            # train model
            if not self.trained:
                self.train()

        # prepare test data
        (featData, clsDataActual) = self.prepValidationData()

        # predict
        self.logger.info("...predicting")
        clsDataPred = self.classifier.predict(featData)

        self.logger.info("...validating")
        scoreMethod = self.config.getStringConfig("validate.score.method")[0]
        if scoreMethod == "accuracy":
            accuracy = sk.metrics.accuracy_score(clsDataActual, clsDataPred)
            self.logger.info("accuracy:")
            self.logger.info(accuracy)
        elif scoreMethod == "confusionMatrix":
            confMatrx = sk.metrics.confusion_matrix(clsDataActual, clsDataPred)
            self.logger.info("confusion matrix:")
            self.logger.info(confMatrx)

    def predictx(self):
        """
        predicts for the prediction data file and logs the predicted classes
        """
        self.prepModel()

        # prepare test data
        featData = self.prepPredictData()

        # predict
        self.logger.info("...predicting")
        clsData = self.classifier.predict(featData)
        self.logger.info(clsData)

    def predict(self, recs=None):
        """
        predicts with in memory data, or with the prediction data file

        Parameters
            recs : comma separated records as one string; when empty or None the
                   prediction data file is used instead

        Returns
            whatever self.classifier.predict returns for the feature data
        """
        self.prepModel()

        if recs:
            # passed records; a single record is parsed as a 1D array and made 2D
            featData = self.prepStringPredictData(recs)
            if (featData.ndim == 1):
                featData = featData.reshape(1, -1)
        else:
            # file
            featData = self.prepPredictData()

        self.logger.info("...predicting")
        clsData = self.classifier.predict(featData)
        return clsData

    def predictProb(self, recs):
        """
        class probability prediction, not available in the base class

        Raises
            ValueError : always
        """
        raise ValueError("can not predict class probability")

    def prepModel(self):
        """
        makes sure a usable model is available before predicting

        With predict.use.saved.model set, the saved model is loaded once and then
        reused. Otherwise the model is trained unless that already happened.
        """
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            # an already loaded model must not fall through to training; the identity
            # test also avoids depending on the truth value of the estimator object
            if self.classifier is None:
                self.logger.info("...loading saved model")
                modelFilePath = self.getModelFilePath()
                self.classifier = joblib.load(modelFilePath)
        elif not self.trained:
            self.train()

    def _loadData(self, dataFile, section):
        """
        loads a data file and optionally scales the features

        Parameters
            dataFile : path of the comma separated data file
            section : config key prefix, "train", "validate" or "predict"

        Returns
            (data, featData) as returned by loadDataFile, with featData scaled when
            common.preprocessing is "scale"
        """
        fieldIndices = self.config.getStringConfig(section + ".data.fields")[0]
        if fieldIndices is not None:
            fieldIndices = strToIntArray(fieldIndices, ",")
        featFieldIndices = self.config.getStringConfig(section + ".data.feature.fields")[0]
        if featFieldIndices is not None:
            featFieldIndices = strToIntArray(featFieldIndices, ",")

        (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
        if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
            scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
            featData = scaleData(featData, scalingMethod)
        return (data, featData)

    def prepTrainingData(self):
        """
        loads and prepares training data

        Returns
            (featData, clsData) with the class labels as a numpy array of int
        """
        dataFile = self.config.getStringConfig("train.data.file")[0]
        classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
        (data, featData) = self._loadData(dataFile, "train")

        clsData = extrColumns(data, classFieldIndex)
        clsData = np.array([int(a) for a in clsData])
        return (featData, clsData)

    def prepValidationData(self):
        """
        loads and prepares validation data

        Returns
            (featData, clsData) with the class labels as a list of int
        """
        dataFile = self.config.getStringConfig("validate.data.file")[0]
        classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]
        (data, featData) = self._loadData(dataFile, "validate")

        clsData = extrColumns(data, classFieldIndex)
        clsData = [int(a) for a in clsData]
        return (featData, clsData)

    def prepPredictData(self):
        """
        loads and prepares the data to predict for

        Returns
            the feature data

        Raises
            ValueError : predict.data.file is not configured
        """
        dataFile = self.config.getStringConfig("predict.data.file")[0]
        if dataFile is None:
            raise ValueError("missing prediction data file")
        (data, featData) = self._loadData(dataFile, "predict")
        return featData

    def prepStringPredictData(self, recs):
        """
        parses comma separated records held in a string into a numpy array
        """
        frecs = StringIO(recs)
        featData = np.loadtxt(frecs, delimiter=',')
        return featData

    def getModelFilePath(self):
        """
        returns the model file path built from common.model.directory and common.model.file

        Raises
            ValueError : common.model.file is not configured
        """
        modelDirectory = self.config.getStringConfig("common.model.directory")[0]
        modelFile = self.config.getStringConfig("common.model.file")[0]
        if modelFile is None:
            raise ValueError("missing model file name")
        modelFilePath = modelDirectory + "/" + modelFile
        return modelFilePath

    def reportResult(self, score, successCriterion, scoreMethod):
        """
        logs the cross validation result and converts it to the success criterion

        Parameters
            score : average cross validation score
            successCriterion : "accuracy" returns the score, "error" returns 1 - score
            scoreMethod : scoring method name, only used in the log message

        Raises
            ValueError : successCriterion is neither "accuracy" nor "error"
        """
        if successCriterion == "accuracy":
            self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
            result = score
        elif successCriterion == "error":
            error = 1.0 - score
            self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
            result = error
        else:
            raise ValueError("invalid success criterion")
        return result

    def autoTrain(self):
        """
        parameter search, followed by training with the best parameters and a diagnosis

        The results are compared with thresholds as errors, so this presumably expects
        train.success.criterion to be "error" (TODO confirm). The final model is trained
        on all data and saved only when the status is 1.

        Returns
            1 : successfully trained
            2 : low generalization error but high average error
            3 : high generalization error
            4 : high generalization error and high average error
        """
        maxTestErr = self.config.getFloatConfig("train.auto.max.test.error")[0]
        maxErr = self.config.getFloatConfig("train.auto.max.error")[0]
        maxErrDiff = self.config.getFloatConfig("train.auto.max.error.diff")[0]

        # no model is saved while searching
        self.config.setParam("train.model.save", "False")

        # train, validate and search optimum parameter
        result = self.trainValidateSearch()
        testError = result[1]

        # subsample training size to match train size for k fold validation
        numFolds = self.config.getIntConfig("train.num.folds")[0]
        self.subSampleRate = float(numFolds - 1) / numFolds

        # train only with optimum parameter values
        for paramValue in result[0]:
            pName = paramValue[0]
            # values are not guaranteed to be strings, so convert as trainValidateSearch does
            pValue = str(paramValue[1])
            self.logger.info(pName + " " + pValue)
            self.setConfigParam(pName, pValue)
        trainError = self.train()

        if testError < maxTestErr:
            # criteria based on test error only
            self.logger.info("Successfullt trained. Low test error level")
            status = 1
        else:
            # criteria based on bias error and generalization error
            avError = (trainError + testError) / 2
            diffError = testError - trainError
            self.logger.info("Auto training completed: training error {:06.3f} test error: {:06.3f}".format(trainError, testError))
            self.logger.info("Average of test and training error: {:06.3f} test and training error diff: {:06.3f}".format(avError, diffError))
            if diffError > maxErrDiff:
                # high generalization error
                if avError > maxErr:
                    # high bias error
                    self.logger.info("High generalization error and high error. Need larger training data set and increased model complexity")
                    status = 4
                else:
                    # low bias error
                    self.logger.info("High generalization error. Need larger training data set")
                    status = 3
            else:
                # low generalization error
                if avError > maxErr:
                    # high bias error
                    self.logger.info("Converged, but with high error rate. Need to increase model complexity")
                    status = 2
                else:
                    # low bias error
                    self.logger.info("Successfullt trained. Low generalization error and low bias error level")
                    status = 1

        if status == 1:
            # train final model, use all data and save model
            self.logger.info("...training the final model")
            self.config.setParam("train.model.save", "True")
            self.subSampleRate = None
            trainError = self.train()
            self.logger.info("training error in final model {:06.3f}".format(trainError))

        return status
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
|
supv/basic_nn.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/Users/pranab/Tools/anaconda/bin/python
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn
|
| 24 |
+
import sklearn.datasets
|
| 25 |
+
import sklearn.linear_model
|
| 26 |
+
import matplotlib
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Command line: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count>
# <learning_rate> <training_mode>
# The print calls take a single argument, so they behave the same under Python 2 and 3.
if len(sys.argv) != 7:
    print("usage: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count> <learning_rate> <training_mode> ")
    sys.exit()

# number of hidden units
nn_hdim = int(sys.argv[1])

# data set size
dsize = int(sys.argv[2])

# noise in training data
noise_level = float(sys.argv[3])

# iteration count
it_count = int(sys.argv[4])

# learning rate
epsilon = float(sys.argv[5])

# training mode
training_mode = sys.argv[6]

# when True the loss is evaluated on the validation rows
use_validation_data = True

# Generate a dataset
#noise_level = 0.20
#noise_level = 0.01
# rows [vlo, vup) of the generated data are held out for validation, one fifth of the
# data set; floor division keeps the slice bound an integer on Python 3 as well
vlo = 100
vup = vlo + dsize // 5
vsize = vup - vlo
# vsize counts the held out validation rows, so the message says so
print("validation data size %d" % (vsize))
np.random.seed(0)
XC, yc = sklearn.datasets.make_moons(dsize, noise=noise_level)

print("complete data set generated")
|
| 65 |
+
def print_array(X,y):
    """Print the feature array X followed by the label array y."""
    # single argument print calls behave the same under Python 2 and Python 3
    print(X)
    print(y)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Generate a validation dataset
|
| 71 |
+
#np.random.seed(0)
|
| 72 |
+
#XV, yv = sklearn.datasets.make_moons(40, noise=0.20)
|
| 73 |
+
#print "validation data set generated"
|
| 74 |
+
|
| 75 |
+
# validation rows are the slice [vlo, vup) of the generated data
XV = XC[vlo:vup:1]
yv = yc[vlo:vup:1]
print("validation data generated")
#print_array(XV, yv)

# training rows are everything outside the validation slice
X = np.delete(XC, np.s_[vlo:vup:1], 0)
y = np.delete(yc, np.s_[vlo:vup:1], 0)
print("training data generated")
#print_array(X, y)
print(X)
print(y)


# Parameters
num_examples = len(X) # training set size
nn_input_dim = 2 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality

# training data indices
tr_data_indices = np.arange(num_examples)
#print tr_data_indices

# Gradient descent parameters (I picked these by hand)
#epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01 # regularization strength
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Helper function to evaluate the total loss on the dataset
|
| 103 |
+
def calculate_loss(X,y,model):
    """
    Return the regularized cross entropy loss of the model on a data set.

    X is the feature array, y holds the integer class index of every row, and model is
    a dict with the entries 'W1', 'b1', 'W2' and 'b2'. The sum of the per row losses
    plus reg_lambda / 2 times the squared weights is divided by the number of rows.
    reg_lambda is read from module scope.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    size = len(X)

    # Forward propagation to calculate the class scores
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2

    # Log softmax with the row maximum subtracted first. This is mathematically the
    # same as log(exp(z2) / sum(exp(z2))) but cannot overflow in exp, and it avoids
    # taking the log of a probability that was rounded down to zero.
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Calculating the loss
    corect_logprobs = -log_probs[np.arange(size), y]
    data_loss = np.sum(corect_logprobs)

    # Add regularization term to loss (optional)
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./size * data_loss
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# Helper function to predict an output (0 or 1)
|
| 124 |
+
def predict(model, x):
    """
    Return the index of the highest scoring class for every row of x.

    model is a dict with the entries 'W1', 'b1', 'W2' and 'b2'; x is a 2D array with
    one row per example.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']

    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2

    # Softmax is monotonic, so the most probable class is the one with the largest
    # score. Taking the argmax of the scores directly avoids the overflow of exp for
    # large scores, which turned the probabilities into NaN and the answer into 0.
    return np.argmax(z2, axis=1)
|
| 134 |
+
|
| 135 |
+
# This function learns parameters for the neural network in batch mode and returns the model.
|
| 136 |
+
# - nn_hdim: Number of nodes in the hidden layer
|
| 137 |
+
# - num_passes: Number of passes through the training data for gradient descent
|
| 138 |
+
# - validation_interval: Number of iterations between loss evaluations; the loss is printed at each evaluation
|
| 139 |
+
def build_model_batch(nn_hdim, num_passes=10000, validation_interval=50):
    """
    Learn the network parameters with batch gradient descent and return the model.

    - nn_hdim: number of nodes in the hidden layer
    - num_passes: number of gradient descent passes over the whole training set
    - validation_interval: the loss is evaluated and printed every this many passes

    The training data (X, y, num_examples), the validation data (XV, yv), the layer
    sizes, epsilon, reg_lambda and use_validation_data are read from module scope.
    Returns a dict with the entries 'W1', 'b1', 'W2' and 'b2'.
    """
    # Initialize the parameters to random values. We need to learn these.
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # The arrays are updated in place below, so this dict always refers to the current
    # parameters. Building it up front also returns a usable model for num_passes == 0.
    model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

    rows = np.arange(num_examples)

    # Gradient descent. For each pass over the batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # subtracting the row maximum leaves the softmax unchanged but prevents overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Back propagation
        delta3 = probs
        delta3[rows, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2

        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if i % validation_interval == 0:
            if use_validation_data:
                cur_loss = calculate_loss(XV,yv,model)
            else:
                cur_loss = calculate_loss(X,y,model)

            print("Loss after iteration %i: %.8f" % (i, cur_loss))

    return model
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# This function learns parameters for the neural network in incremental mode (one training example at a time) and returns the model.
|
| 199 |
+
# - nn_hdim: Number of nodes in the hidden layer
|
| 200 |
+
# - num_passes: Number of passes through the training data for gradient descent
|
| 201 |
+
# - validation_interval: Number of passes between loss evaluations; the loss is printed each time it is evaluated
|
| 202 |
+
def build_model_incr(nn_hdim, num_passes=10000, validation_interval=50):
    """
    Learns the network parameters incrementally, updating after every single
    training example, and returns the model.

    The function reads names that it does not define itself and that therefore have to
    exist at module level: X, y, XV, yv, tr_data_indices, nn_input_dim, nn_output_dim,
    epsilon, reg_lambda, use_validation_data and calculate_loss. tr_data_indices is
    shuffled in place at the start of every pass.

    Parameters
        nn_hdim : number of nodes in the hidden layer
        num_passes : number of passes through the training data for gradient descent
        validation_interval : the loss is evaluated and printed every validation_interval passes

    Returns a dict with the learned 'W1', 'b1', 'W2' and 'b2' (empty when num_passes is 0).
    """
    # Initialize the parameters to random values. We need to learn these.
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}

    # gradient descent, one pass over all training examples per iteration
    # range and the print function work under both Python 2 and Python 3
    for i in range(num_passes):
        # shuffle training data indices
        np.random.shuffle(tr_data_indices)

        # all training data
        for j in tr_data_indices:
            # a single row; -1 keeps this valid for any number of input
            # features instead of only the two that used to be hard coded
            Xi = X[j].reshape(1, -1)
            yi = y[j].reshape(1)

            # Forward propagation
            z1 = Xi.dot(W1) + b1
            a1 = np.tanh(z1)
            z2 = a1.dot(W2) + b2
            exp_scores = np.exp(z2)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

            # Back propagation
            # delta3 aliases probs, which is not needed any more after this point
            delta3 = probs
            delta3[0, yi] -= 1
            dW2 = (a1.T).dot(delta3)
            db2 = np.sum(delta3, axis=0, keepdims=True)
            delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
            dW1 = np.dot(Xi.T, delta2)
            db1 = np.sum(delta2, axis=0)

            # Add regularization terms (b1 and b2 don't have regularization terms)
            dW2 += reg_lambda * W2
            dW1 += reg_lambda * W1

            # Gradient descent parameter update
            W1 += -epsilon * dW1
            b1 += -epsilon * db1
            W2 += -epsilon * dW2
            b2 += -epsilon * db2

        # Assign new parameters to the model; the arrays are updated in place,
        # so doing this once per pass is sufficient
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if i % validation_interval == 0:
            if use_validation_data:
                cur_loss = calculate_loss(XV, yv, model)
            else:
                cur_loss = calculate_loss(X, y, model)

            print("Loss after iteration %i: %.8f" % (i, cur_loss))

    return model
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
# Build a model with a 3-dimensional hidden layer
|
| 269 |
+
# Train in the configured mode; the loss is evaluated after every pass
if training_mode == "batch":
    model = build_model_batch(nn_hdim, num_passes=it_count, validation_interval=1)
elif training_mode == "incr":
    model = build_model_incr(nn_hdim, num_passes=it_count, validation_interval=1)
else:
    print("invalid learning mode")
    sys.exit()

# Dump the learned parameters row by row. print is called as a function so
# that the script runs under Python 3 as well as Python 2.
print("hidden layer")
for row in model['W1']:
    print(row)

print("hidden layer bias")
for row in model['b1']:
    print(row)

print("output layer")
for row in model['W2']:
    print(row)

print("output layer bias")
for row in model['b2']:
    print(row)
|
| 292 |
+
|
| 293 |
+
|
supv/fftn.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import torch
|
| 24 |
+
from torch.autograd import Variable
|
| 25 |
+
from torch.utils.data import Dataset, TensorDataset
|
| 26 |
+
from torch.utils.data import DataLoader
|
| 27 |
+
import sklearn as sk
|
| 28 |
+
import matplotlib
|
| 29 |
+
import random
|
| 30 |
+
import jprops
|
| 31 |
+
from random import randint
|
| 32 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 33 |
+
from util import *
|
| 34 |
+
from mlutil import *
|
| 35 |
+
from tnn import *
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class FeedForwardTwinNetwork(FeedForwardNetwork):
    """
    siamese twin feed forward network

    Every data row carries three feature groups side by side. The same layer
    stack is applied to each group and the three outputs go to the loss function.
    """
    def __init__(self, configFile):
        """
        Registers the twin specific configuration default and delegates to the base class

        Parameters
            configFile : configuration file path
        """
        defValues = dict()
        defValues["train.twin.crossenc"] = (False, None)
        super(FeedForwardTwinNetwork, self).__init__(configFile, defValues)

    @staticmethod
    def _splitFeatures(featData, feCount):
        """
        Splits rows into three column groups

        Parameters
            featData : 2D data with the three feature groups laid out side by side
            feCount : number of columns in each of the first two groups

        Returns the tuple (first feCount columns, next feCount columns, remaining columns)
        """
        return (featData[:,:feCount], featData[:,feCount:2*feCount], featData[:,2*feCount:])

    def buildModel(self):
        """
        Loads configuration and builds the various pieces necessary for the model
        """
        super().buildModel()

        # keep the validation data split by feature group for evaluateModel
        feCount = self.config.getIntConfig("train.input.size")[0]
        (self.vaFe1, self.vaFe2, self.vaFe3) = FeedForwardTwinNetwork._splitFeatures(self.validFeatData, feCount)

    def forward(self, x1, x2, x3):
        """
        Go through the same layers three times, once per feature group

        Parameters
            x1 : first feature group
            x2 : second feature group
            x3 : third feature group

        Returns the tuple of the three outputs
        """
        y1 = self.layers(x1)
        y2 = self.layers(x2)
        y3 = self.layers(x3)
        y = (y1, y2, y3)
        return y

    @staticmethod
    def batchTrain(model):
        """
        train with batch data

        Parameters
            model : torch model

        Always returns 1.0
        """
        feCount = model.config.getIntConfig("train.input.size")[0]
        (fe1, fe2, fe3) = FeedForwardTwinNetwork._splitFeatures(model.featData, feCount)

        print(fe1.shape)
        print(fe2.shape)
        print(fe3.shape)
        trainData = TensorDataset(fe1, fe2, fe3)
        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
        epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

        # train mode
        model.train()

        # only filled when error tracking is on
        trErr = list()
        vaErr = list()

        #epoch
        for t in range(model.numIter):
            #batch
            b = 0
            epochLoss = 0.0
            for x1Batch, x2Batch, x3Batch in trainDataLoader:

                # Forward pass: Compute predicted y by passing x to the model
                yPred = model(x1Batch, x2Batch, x3Batch)

                # Compute and print loss
                loss = model.lossFn(yPred[0], yPred[1], yPred[2])
                if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:
                    print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

                # batchIntv of 0 selects error tracking at epoch level
                if model.trackErr and model.batchIntv == 0:
                    epochLoss += loss.item()

                #error tracking at batch level
                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
                    trErr.append(loss.item())
                    vloss = FeedForwardTwinNetwork.evaluateModel(model)
                    vaErr.append(vloss)

                # Zero gradients, perform a backward pass, and update the weights.
                model.optimizer.zero_grad()
                loss.backward()
                model.optimizer.step()
                b += 1

            #error tracking at epoch level
            if model.trackErr and model.batchIntv == 0:
                epochLoss /= b
                if model.verbose:
                    print("epoch {} loss {:.6f}".format(t, epochLoss))
                trErr.append(epochLoss)
                vloss = FeedForwardTwinNetwork.evaluateModel(model)
                vaErr.append(vloss)

        #save
        modelSave = model.config.getBooleanConfig("train.model.save")[0]
        if modelSave:
            FeedForwardNetwork.saveCheckpt(model)

        if model.trackErr:
            FeedForwardNetwork.errorPlot(model, trErr, vaErr)

        return 1.0

    @staticmethod
    def evaluateModel(model):
        """
        evaluate model with the validation data and switch back to train mode

        Parameters
            model : torch model

        Returns the loss on the validation data as a python number
        """
        model.eval()
        with torch.no_grad():
            yPred = model(model.vaFe1, model.vaFe2, model.vaFe3)
            score = model.lossFn(yPred[0], yPred[1], yPred[2]).item()
        model.train()
        return score

    @staticmethod
    def testModel(model):
        """
        test model

        Rows of the prediction data are scored in consecutive groups of
        train.output.size rows. A group counts as correct when its first row
        has the highest mean cosine similarity of (output 1, output 2) and
        (output 1, output 3). A trailing incomplete group is not scored.

        Parameters
            model : torch model

        Returns the fraction of correct groups, 0.0 when there is no complete group
        """
        useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            FeedForwardNetwork.restoreCheckpt(model)
        else:
            FeedForwardTwinNetwork.batchTrain(model)

        dataSource = model.config.getStringConfig("predict.data.file")[0]
        featData = FeedForwardNetwork.prepData(model, dataSource, False)
        featData = torch.from_numpy(featData)
        feCount = model.config.getIntConfig("train.input.size")[0]
        (fe1, fe2, fe3) = FeedForwardTwinNetwork._splitFeatures(featData, feCount)

        model.eval()
        with torch.no_grad():
            yp = model(fe1, fe2, fe3)
            cos = torch.nn.CosineSimilarity()
            s1 = cos(yp[0], yp[1]).data.cpu().numpy()
            s2 = cos(yp[0], yp[2]).data.cpu().numpy()

            n = yp[0].shape[0]
            if model.verbose:
                print(n)
                # show at most the first 15 rows, fewer when the data is smaller
                for i in range(min(15, n)):
                    if i % 3 == 0:
                        print("next")
                    print(yp[0][i])
                    print(yp[1][i])
                    print(yp[2][i])
                    print("similarity {:.3f} {:.3f}".format(s1[i], s2[i]))

            tc = 0
            cc = 0
            outputSize = model.config.getIntConfig("train.output.size")[0]
            # stop before a trailing incomplete group so that i + j stays in range
            for i in range(0, n - outputSize + 1, outputSize):
                #for each sample outputSize no of rows
                msi = None
                imsi = None
                for j in range(outputSize):
                    #first one positive , followed by all negative
                    si = (s1[i+j] + s2[i+j]) / 2
                    if msi is None or si > msi:
                        msi = si
                        imsi = j
                tc += 1
                if imsi == 0:
                    cc += 1
            # no complete group means nothing was scored
            score = cc / tc if tc > 0 else 0.0
            print("score: {:.3f}".format(score))
        model.train()
        return score
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
|
supv/gbt.py
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import matplotlib
|
| 25 |
+
import random
|
| 26 |
+
import jprops
|
| 27 |
+
from sklearn.ensemble import GradientBoostingClassifier
|
| 28 |
+
import joblib
|
| 29 |
+
from sklearn.metrics import accuracy_score
|
| 30 |
+
from sklearn.metrics import confusion_matrix
|
| 31 |
+
from sklearn.model_selection import cross_val_score
|
| 32 |
+
from random import randint
|
| 33 |
+
from io import StringIO
|
| 34 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 35 |
+
from util import *
|
| 36 |
+
from mlutil import *
|
| 37 |
+
from pasearch import *
|
| 38 |
+
from bacl import *
|
| 39 |
+
|
| 40 |
+
# gradient boosting classification
|
| 41 |
+
class GradientBoostedTrees(object):
|
| 42 |
+
def __init__(self, configFile):
    """
    Declares the configuration defaults, loads the configuration and creates the logger

    Parameters
        configFile : configuration file path passed on to Configuration
    """
    # every value is a 2 tuple whose first element is the default value; the second
    # element is None or a "missing ..." text, presumably the error reported
    # when a mandatory parameter is absent (verify against Configuration)
    defValues = {
        "common.mode" : ("training", None),
        "common.model.directory" : ("model", None),
        "common.model.file" : (None, None),
        "common.preprocessing" : (None, None),
        "common.verbose" : (False, None),
        "train.data.file" : (None, "missing training data file"),
        "train.data.fields" : (None, "missing training data field ordinals"),
        "train.data.feature.fields" : (None, "missing training data feature field ordinals"),
        "train.data.class.field" : (None, "missing class field ordinal"),
        "train.validation" : ("kfold", None),
        "train.num.folds" : (5, None),
        "train.min.samples.split" : ("4", None),
        "train.min.samples.leaf.gb" : ("2", None),
        "train.max.depth.gb" : (3, None),
        "train.max.leaf.nodes.gb" : (None, None),
        "train.max.features.gb" : (None, None),
        "train.learning.rate" : (0.1, None),
        "train.num.estimators.gb" : (100, None),
        "train.subsample" : (1.0, None),
        "train.loss" : ("deviance", None),
        "train.random.state" : (None, None),
        "train.verbose" : (0, None),
        "train.warm.start" : (False, None),
        "train.presort" : ("auto", None),
        "train.criterion" : ("friedman_mse", None),
        "train.success.criterion" : ("error", None),
        "train.model.save" : (False, None),
        "train.score.method" : ("accuracy", None),
        "train.search.param.strategy" : (None, None),
        "train.search.params" : (None, None),
        "predict.data.file" : (None, None),
        "predict.data.fields" : (None, "missing data field ordinals"),
        "predict.data.feature.fields" : (None, "missing data feature field ordinals"),
        "predict.use.saved.model" : (False, None),
        "validate.data.file" : (None, "missing validation data file"),
        "validate.data.fields" : (None, "missing validation data field ordinals"),
        "validate.data.feature.fields" : (None, "missing validation data feature field ordinals"),
        "validate.data.class.field" : (None, "missing class field ordinal"),
        "validate.use.saved.model" : (False, None),
        "validate.score.method" : ("accuracy", None),
    }
    self.config = Configuration(configFile, defValues)

    # nothing is loaded or built until training or prediction is requested
    self.subSampleRate = None
    self.featData = None
    self.clsData = None
    self.gbcClassifier = None

    self.verbose = self.config.getBooleanConfig("common.verbose")[0]

    # logger
    logFilePath = self.config.getStringConfig("common.logging.file")[0]
    logLevName = self.config.getStringConfig("common.logging.level")[0]
    self.logger = createLogger(__name__, logFilePath, logLevName)
    self.logger.info("********* starting session")
|
| 95 |
+
|
| 96 |
+
# initialize config
|
| 97 |
+
def initConfig(self, configFile, defValues):
    """
    Replaces the configuration object

    Parameters
        configFile : configuration file path
        defValues : dictionary of default values
    """
    newConfig = Configuration(configFile, defValues)
    self.config = newConfig
|
| 99 |
+
|
| 100 |
+
# get config object
|
| 101 |
+
def getConfig(self):
|
| 102 |
+
return self.config
|
| 103 |
+
|
| 104 |
+
#set config param
|
| 105 |
+
def setConfigParam(self, name, value):
|
| 106 |
+
self.config.setParam(name, value)
|
| 107 |
+
|
| 108 |
+
#get mode
|
| 109 |
+
def getMode(self):
|
| 110 |
+
return self.config.getStringConfig("common.mode")[0]
|
| 111 |
+
|
| 112 |
+
#get search parameter
|
| 113 |
+
def getSearchParamStrategy(self):
|
| 114 |
+
return self.config.getStringConfig("train.search.param.strategy")[0]
|
| 115 |
+
|
| 116 |
+
def setModel(self, model):
|
| 117 |
+
self.gbcClassifier = model
|
| 118 |
+
|
| 119 |
+
# train model
|
| 120 |
+
def train(self):
|
| 121 |
+
#build model
|
| 122 |
+
self.buildModel()
|
| 123 |
+
|
| 124 |
+
# training data
|
| 125 |
+
if self.featData is None:
|
| 126 |
+
(featData, clsData) = self.prepTrainingData()
|
| 127 |
+
(self.featData, self.clsData) = (featData, clsData)
|
| 128 |
+
else:
|
| 129 |
+
(featData, clsData) = (self.featData, self.clsData)
|
| 130 |
+
if self.subSampleRate is not None:
|
| 131 |
+
(featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
|
| 132 |
+
self.logger.info("subsample size " + str(featData.shape[0]))
|
| 133 |
+
|
| 134 |
+
# parameters
|
| 135 |
+
modelSave = self.config.getBooleanConfig("train.model.save")[0]
|
| 136 |
+
|
| 137 |
+
#train
|
| 138 |
+
self.logger.info("...training model")
|
| 139 |
+
self.gbcClassifier.fit(featData, clsData)
|
| 140 |
+
score = self.gbcClassifier.score(featData, clsData)
|
| 141 |
+
successCriterion = self.config.getStringConfig("train.success.criterion")[0]
|
| 142 |
+
result = None
|
| 143 |
+
if successCriterion == "accuracy":
|
| 144 |
+
self.logger.info("accuracy with training data {:06.3f}".format(score))
|
| 145 |
+
result = score
|
| 146 |
+
elif successCriterion == "error":
|
| 147 |
+
error = 1.0 - score
|
| 148 |
+
self.logger.info("error with training data {:06.3f}".format(error))
|
| 149 |
+
result = error
|
| 150 |
+
else:
|
| 151 |
+
raise ValueError("invalid success criterion")
|
| 152 |
+
|
| 153 |
+
if modelSave:
|
| 154 |
+
self.logger.info("...saving model")
|
| 155 |
+
modelFilePath = self.getModelFilePath()
|
| 156 |
+
joblib.dump(self.gbcClassifier, modelFilePath)
|
| 157 |
+
return result
|
| 158 |
+
|
| 159 |
+
#train with k fold validation
|
| 160 |
+
def trainValidate(self):
    """
    Builds the model and scores it with k fold cross validation on the training data

    Returns whatever reportResult returns for the mean of the fold scores
    """
    self.buildModel()

    # training data, prepared afresh on every call
    (trFeatures, trClasses) = self.prepTrainingData()

    # parameters; train.validation is read like before although its value is not used
    config = self.config
    config.getStringConfig("train.validation")
    foldCount = config.getIntConfig("train.num.folds")[0]
    criterion = config.getStringConfig("train.success.criterion")[0]
    scoring = config.getStringConfig("train.score.method")[0]

    # train with validation
    self.logger.info("...training and kfold cross validating model")
    foldScores = cross_val_score(self.gbcClassifier, trFeatures, trClasses, cv=foldCount,scoring=scoring)
    return self.reportResult(np.mean(foldScores), criterion, scoring)
|
| 179 |
+
|
| 180 |
+
#train with k fold validation and search parameter space for optimum
|
| 181 |
+
def trainValidateSearch(self):
    """
    Trains with k fold validation for a sequence of parameter value combinations
    proposed by the configured search strategy

    The strategy is selected with train.search.param.strategy (grid, random or simuan).
    train.search.params holds a comma separated list of colon separated items whose
    first part is the parameter name. The candidate values of each parameter are
    read as a comma separated list from the configuration entry of that name.

    Returns the best solution reported by the search strategy; element 0 holds the
    (name, value) pairs and element 1 the result that is logged with them
    Raises ValueError when the search strategy or the search parameter list is
    missing or the strategy name is not known
    """
    self.logger.info("...starting train validate with parameter search")
    searchStrategyName = self.getSearchParamStrategy()
    if searchStrategyName is None:
        raise ValueError("missing search strategy")

    if searchStrategyName == "grid":
        searchStrategy = GuidedParameterSearch(self.verbose)
    elif searchStrategyName == "random":
        searchStrategy = RandomParameterSearch(self.verbose)
        maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
        searchStrategy.setMaxIter(maxIter)
    elif searchStrategyName == "simuan":
        searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
        maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
        searchStrategy.setMaxIter(maxIter)
        temp = self.config.getFloatConfig("train.search.sa.temp")[0]
        searchStrategy.setTemp(temp)
        tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
        searchStrategy.setTempReductionRate(tempRedRate)
    else:
        raise ValueError("invalid paramtere search strategy")

    # add search params; the None check has to come before split, otherwise a
    # missing list fails with AttributeError and the ValueError is never raised
    searchParamsConfig = self.config.getStringConfig("train.search.params")[0]
    if searchParamsConfig is None:
        raise ValueError("missing search parameter list")

    searchParamNames = []
    extSearchParamNames = []
    for searchParam in searchParamsConfig.split(","):
        paramItems = searchParam.split(":")
        extSearchParamNames.append(paramItems[0])

        #get rid name component search
        paramNameItems = paramItems[0].split(".")
        del paramNameItems[1]
        paramItems[0] = ".".join(paramNameItems)

        searchStrategy.addParam(paramItems)
        searchParamNames.append(paramItems[0])

    # add search param data list for each param
    for (searchParamName,extSearchParamName) in zip(searchParamNames,extSearchParamNames):
        searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
        searchStrategy.addParamVaues(searchParamName, searchParamData)

    # train and validate for various param value combination
    searchStrategy.prepare()
    paramValues = searchStrategy.nextParamValues()
    searchResults = []
    while paramValues is not None:
        self.logger.info("...next parameter set")
        for paramValue in paramValues:
            self.setConfigParam(paramValue[0], str(paramValue[1]))
        paramStr = "".join(paramValue[0] + "=" + str(paramValue[1]) + " " for paramValue in paramValues)
        result = self.trainValidate()
        searchStrategy.setCost(result)
        searchResults.append((paramStr, result))
        paramValues = searchStrategy.nextParamValues()

    # output
    self.logger.info("all parameter search results")
    for searchResult in searchResults:
        self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))

    self.logger.info("best parameter search result")
    bestSolution = searchStrategy.getBestSolution()
    paramStr = "".join(paramValue[0] + "=" + str(paramValue[1]) + " " for paramValue in bestSolution[0])
    self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
    return bestSolution
|
| 255 |
+
|
| 256 |
+
#validate with labeled validation data
|
| 257 |
+
def validate(self):
|
| 258 |
+
# create model
|
| 259 |
+
useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
|
| 260 |
+
if useSavedModel:
|
| 261 |
+
# load saved model
|
| 262 |
+
self.logger.info("...loading model")
|
| 263 |
+
modelFilePath = self.getModelFilePath()
|
| 264 |
+
self.gbcClassifier = joblib.load(modelFilePath)
|
| 265 |
+
else:
|
| 266 |
+
# train model
|
| 267 |
+
self.train()
|
| 268 |
+
|
| 269 |
+
# prepare test data
|
| 270 |
+
(featData, clsDataActual) = self.prepValidationData()
|
| 271 |
+
|
| 272 |
+
#predict
|
| 273 |
+
self.logger.info("...predicting")
|
| 274 |
+
clsDataPred = self.gbcClassifier.predict(featData)
|
| 275 |
+
|
| 276 |
+
self.logger.info("...validating")
|
| 277 |
+
#self.logger.info(clsData)
|
| 278 |
+
scoreMethod = self.config.getStringConfig("validate.score.method")[0]
|
| 279 |
+
if scoreMethod == "accuracy":
|
| 280 |
+
accuracy = accuracy_score(clsDataActual, clsDataPred)
|
| 281 |
+
self.logger.info("accuracy:")
|
| 282 |
+
self.logger.info(accuracy)
|
| 283 |
+
elif scoreMethod == "confusionMatrix":
|
| 284 |
+
confMatrx = confusion_matrix(clsDataActual, clsDataPred)
|
| 285 |
+
self.logger.info("confusion matrix:")
|
| 286 |
+
self.logger.info(confMatrx)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
#predict
|
| 290 |
+
def predictx(self):
    """
    predicts classes for the configured prediction data file and logs them

    The model is loaded from the model file when "predict.use.saved.model"
    is set, otherwise it is trained first.
    """
    # get a model, either saved or freshly trained
    if self.config.getBooleanConfig("predict.use.saved.model")[0]:
        self.logger.info("...loading model")
        self.gbcClassifier = joblib.load(self.getModelFilePath())
    else:
        self.train()

    # feature data from the prediction file
    features = self.prepPredictData()

    self.logger.info("...predicting")
    predicted = self.gbcClassifier.predict(features)
    self.logger.info(predicted)
|
| 309 |
+
|
| 310 |
+
#predict with in memory data
|
| 311 |
+
def predict(self, recs=None):
    """
    predicts classes for in memory records or for the prediction data file

    Parameters
        recs : comma separated records as a string; when empty or None the
               configured prediction data file is used instead

    Returns the value returned by the classifier's predict()
    """
    # make sure a model is available
    self.prepModel()

    if not recs:
        # no records passed, fall back to the prediction file
        features = self.prepPredictData()
    else:
        features = self.prepStringPredictData(recs)
        # a single record parses as a 1D array, the classifier needs 2D
        features = features.reshape(1, -1) if features.ndim == 1 else features

    self.logger.info("...predicting")
    return self.gbcClassifier.predict(features)
|
| 330 |
+
|
| 331 |
+
#predict probability with in memory data
|
| 332 |
+
def predictProb(self, recs):
    """
    predicts class probabilities for in memory records

    Parameters
        recs : either comma separated records as a string, or an object
               that is used as the feature data directly (it must provide
               ndim and reshape)

    Returns the value returned by the classifier's predict_proba()
    """
    # make sure a model is available
    self.prepModel()

    # strings are parsed, anything else is taken as feature data as is
    features = self.prepStringPredictData(recs) if type(recs) is str else recs

    # a single record comes as a 1D array, the classifier needs 2D
    if features.ndim == 1:
        features = features.reshape(1, -1)

    self.logger.info("...predicting class probability")
    return self.gbcClassifier.predict_proba(features)
|
| 349 |
+
|
| 350 |
+
#preparing model
|
| 351 |
+
def prepModel(self):
    """
    makes sure a classifier is available, loading or training as configured

    When "predict.use.saved.model" is set the saved model is loaded, but only
    if no classifier is held yet. Previously an already loaded model fell
    through to the training branch and was retrained on every call. When
    the flag is not set the model is trained, as before.

    Returns the classifier held by this object
    """
    useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
    if useSavedModel:
        if not self.gbcClassifier:
            # load saved model only once
            self.logger.info("...loading saved model")
            modelFilePath = self.getModelFilePath()
            self.gbcClassifier = joblib.load(modelFilePath)
    else:
        # train model
        self.train()
    return self.gbcClassifier
|
| 362 |
+
|
| 363 |
+
#prepare string predict data
|
| 364 |
+
def prepStringPredictData(self, recs):
    """
    parses comma separated records held in a string into a numpy array

    Parameters
        recs : string with one record per line and comma separated fields
    """
    return np.loadtxt(StringIO(recs), delimiter=',')
|
| 369 |
+
|
| 370 |
+
#loads and prepares training data
|
| 371 |
+
def prepTrainingData(self):
    """
    loads the training data file and splits it into features and class labels

    Returns a tuple of feature data and a numpy array of integer class labels
    """
    cfg = self.config

    # file and column selection
    dataFile = cfg.getStringConfig("train.data.file")[0]
    fieldIndices = cfg.getStringConfig("train.data.fields")[0]
    if fieldIndices is not None:
        fieldIndices = strToIntArray(fieldIndices, ",")
    featFieldIndices = cfg.getStringConfig("train.data.feature.fields")[0]
    if featFieldIndices is not None:
        featFieldIndices = strToIntArray(featFieldIndices, ",")
    classFieldIndex = cfg.getIntConfig("train.data.class.field")[0]

    # all selected columns and the feature columns
    data, featData = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)

    # class column converted to int labels
    clsData = np.array(list(map(int, extrColumns(data, classFieldIndex))))
    return (featData, clsData)
|
| 387 |
+
|
| 388 |
+
#loads and prepares validation data
|
| 389 |
+
def prepValidationData(self):
    """
    loads the validation data file and splits it into features and class labels

    Returns a tuple of feature data and a list of integer class labels
    """
    cfg = self.config

    # file and column selection
    dataFile = cfg.getStringConfig("validate.data.file")[0]
    fieldIndices = cfg.getStringConfig("validate.data.fields")[0]
    if fieldIndices is not None:
        fieldIndices = strToIntArray(fieldIndices, ",")
    featFieldIndices = cfg.getStringConfig("validate.data.feature.fields")[0]
    if featFieldIndices is not None:
        featFieldIndices = strToIntArray(featFieldIndices, ",")
    classFieldIndex = cfg.getIntConfig("validate.data.class.field")[0]

    # all selected columns and the feature columns
    data, featData = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)

    # class column converted to int labels, kept as a plain list
    clsData = list(map(int, extrColumns(data, classFieldIndex)))
    return (featData, clsData)
|
| 405 |
+
|
| 406 |
+
#loads and prepares prediction data
|
| 407 |
+
def prepPredictData(self):
    """
    loads the prediction data file and returns its feature columns

    Raises ValueError when no prediction data file is configured
    """
    cfg = self.config

    dataFile = cfg.getStringConfig("predict.data.file")[0]
    if dataFile is None:
        raise ValueError("missing prediction data file")

    # column selection
    fieldIndices = cfg.getStringConfig("predict.data.fields")[0]
    if fieldIndices is not None:
        fieldIndices = strToIntArray(fieldIndices, ",")
    featFieldIndices = cfg.getStringConfig("predict.data.feature.fields")[0]
    if featFieldIndices is not None:
        featFieldIndices = strToIntArray(featFieldIndices, ",")

    # only the feature part of the loaded data is needed
    _, featData = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
    return featData
|
| 423 |
+
|
| 424 |
+
# get model file path
|
| 425 |
+
def getModelFilePath(self):
    """
    builds the model file path from the configured directory and file name

    Raises ValueError when no model file name is configured
    """
    directory = self.config.getStringConfig("common.model.directory")[0]
    fileName = self.config.getStringConfig("common.model.file")[0]
    if fileName is None:
        raise ValueError("missing model file name")
    return "/".join([directory, fileName])
|
| 432 |
+
|
| 433 |
+
# report result
|
| 434 |
+
def reportResult(self, score, successCriterion, scoreMethod):
    """
    logs the cross validation score and returns the value to optimize

    Parameters
        score : average score from cross validation
        successCriterion : "accuracy" to report the score itself, "error"
                           to report 1.0 - score
        scoreMethod : name of the score method, used in the log message

    Raises ValueError for any other success criterion
    """
    if successCriterion not in ("accuracy", "error"):
        raise ValueError("invalid success criterion")

    if successCriterion == "accuracy":
        self.logger.info("average {} with k fold cross validation {:06.3f}".format(scoreMethod, score))
        return score

    error = 1.0 - score
    self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
    return error
|
| 445 |
+
|
| 446 |
+
# builds model object
|
| 447 |
+
def buildModel(self):
    """
    builds the gradient boosted tree classifier from the training configuration

    The classifier is stored in self.gbcClassifier. The presort argument is
    only passed when the installed GradientBoostingClassifier still accepts
    it. scikit-learn removed that parameter, and passing it unconditionally
    raises TypeError on releases without it.
    """
    import inspect

    self.logger.info("...building gradient boosted tree model")
    cfg = self.config

    # parameters; typedValue converts the configured string into a typed value
    minSamplesSplit = typedValue(cfg.getStringConfig("train.min.samples.split")[0])
    minSamplesLeaf = typedValue(cfg.getStringConfig("train.min.samples.leaf.gb")[0])
    (maxDepth, maxLeafNodes) = cfg.eitherOrIntConfig("train.max.depth.gb", "train.max.leaf.nodes.gb")
    maxFeatures = typedValue(cfg.getStringConfig("train.max.features.gb")[0])
    learningRate = cfg.getFloatConfig("train.learning.rate")[0]
    numEstimators = cfg.getIntConfig("train.num.estimators.gb")[0]
    subsampleFraction = cfg.getFloatConfig("train.subsample")[0]
    lossFun = cfg.getStringConfig("train.loss")[0]
    randomState = cfg.getIntConfig("train.random.state")[0]
    verboseOutput = cfg.getIntConfig("train.verbose")[0]
    warmStart = cfg.getBooleanConfig("train.warm.start")[0]

    # presort[1] presumably flags that the default value was used, in which
    # case the value is taken as is; otherwise the string is converted to
    # a boolean - TODO confirm against Configuration
    presort = cfg.getStringConfig("train.presort")
    if presort[1]:
        presortChoice = presort[0]
    else:
        presortChoice = presort[0].lower() == "true"

    # NOTE(review): the split criterion is read but was never passed to the
    # classifier; kept that way to preserve current behavior
    splitCriterion = cfg.getStringConfig("train.criterion")[0]

    params = dict(
        loss=lossFun,
        learning_rate=learningRate,
        n_estimators=numEstimators,
        subsample=subsampleFraction,
        min_samples_split=minSamplesSplit,
        min_samples_leaf=minSamplesLeaf,
        min_weight_fraction_leaf=0.0,
        max_depth=maxDepth,
        init=None,
        random_state=randomState,
        max_features=maxFeatures,
        verbose=verboseOutput,
        max_leaf_nodes=maxLeafNodes,
        warm_start=warmStart,
    )

    # only pass presort to scikit-learn versions that still support it
    if "presort" in inspect.signature(GradientBoostingClassifier.__init__).parameters:
        params["presort"] = presortChoice

    #classifier
    self.gbcClassifier = GradientBoostingClassifier(**params)
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
|
supv/gcn.py
ADDED
|
@@ -0,0 +1,444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import matplotlib
|
| 23 |
+
import random
|
| 24 |
+
from random import randint
|
| 25 |
+
from itertools import compress
|
| 26 |
+
import numpy as np
|
| 27 |
+
import torch
|
| 28 |
+
from torch import nn
|
| 29 |
+
from torch.nn import Linear
|
| 30 |
+
from torch.autograd import Variable
|
| 31 |
+
from torch.utils.data import DataLoader
|
| 32 |
+
from torchvision import transforms
|
| 33 |
+
from torch_geometric.nn import GCNConv
|
| 34 |
+
from torch_geometric.nn import MessagePassing
|
| 35 |
+
from torch_geometric.data import Data
|
| 36 |
+
import sklearn as sk
|
| 37 |
+
import jprops
|
| 38 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 39 |
+
from util import *
|
| 40 |
+
from mlutil import *
|
| 41 |
+
from tnn import FeedForwardNetwork
|
| 42 |
+
|
| 43 |
+
"""
|
| 44 |
+
Graph convolution network
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
class GraphConvoNetwork(nn.Module):
|
| 48 |
+
def __init__(self, configFile):
    """
    initializer

    Parameters
        configFile : config file path
    """
    # initialize the torch module before any attribute is set on it
    super(GraphConvoNetwork, self).__init__()

    # key -> (default value, message); the message appears to be the error
    # reported when a mandatory value is missing - TODO confirm against
    # Configuration
    defValues = {
        "common.model.directory": ("model", None),
        "common.model.file": (None, None),
        "common.preprocessing": (None, None),
        "common.scaling.method": ("zscale", None),
        "common.scaling.minrows": (50, None),
        "common.scaling.param.file": (None, None),
        "common.verbose": (False, None),
        "common.device": ("cpu", None),
        "train.data.file": (None, "missing training data file"),
        "train.data.num.nodes.total": (None, None),
        "train.data.num.nodes.training": (None, None),
        "train.data.splits": ([.75, .15, .10], None),
        "train.layer.data": (None, "missing layer data"),
        # message fixed, it used to say "missing output size"
        "train.input.size": (None, "missing input size"),
        "train.output.size": (None, "missing output size"),
        "train.loss.reduction": ("mean", None),
        "train.num.iterations": (500, None),
        "train.lossFn": ("mse", None),
        "train.optimizer": ("sgd", None),
        "train.opt.learning.rate": (.0001, None),
        "train.opt.weight.decay": (0, None),
        "train.opt.momentum": (0, None),
        "train.opt.eps": (1e-08, None),
        "train.opt.dampening": (0, None),
        "train.opt.momentum.nesterov": (False, None),
        "train.opt.betas": ([0.9, 0.999], None),
        "train.opt.alpha": (0.99, None),
        "train.save.model": (False, None),
        # trainModel reads "train.model.save", which had no default; the
        # older key above is kept for existing configurations
        "train.model.save": (False, None),
        "train.track.error": (False, None),
        "train.epoch.intv": (5, None),
        "train.print.weights": (False, None),
        "valid.accuracy.metric": (None, None),
        "predict.create.mask": (False, None),
        "predict.use.saved.model": (True, None),
    }

    self.config = Configuration(configFile, defValues)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def getConfig(self):
    """
    gets the configuration object created by the initializer
    """
    return self.config
|
| 100 |
+
|
| 101 |
+
def buildModel(self):
    """
    Loads configuration and builds the various pieces necessary for the model

    Each item of "train.layer.data" is a colon separated layer description:
    number of units, activation, whether to batch normalize, whether to batch
    normalize after activation, dropout fraction. An optional 6th item in
    front selects the layer type; "gconv" creates a graph convolution layer,
    anything else (or no type) creates a linear layer.
    """
    torch.manual_seed(9999)

    self.verbose = self.config.getBooleanConfig("common.verbose")[0]
    numinp = self.config.getIntConfig("train.input.size")[0]
    self.outputSize = self.config.getIntConfig("train.output.size")[0]
    self.numIter = self.config.getIntConfig("train.num.iterations")[0]
    optimizer = self.config.getStringConfig("train.optimizer")[0]
    self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
    self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
    self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
    self.restored = False
    # class labels only when there is more than one output
    self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

    #build network
    layers = list()
    ninp = numinp
    trData = self.config.getStringConfig("train.layer.data")[0].split(",")
    for ld in trData:
        lde = ld.split(":")
        ne = len(lde)
        assert ne == 5 or ne == 6, "expecting 5 or 6 items for layer data"

        # the original compared (gconv == True) instead of assigning, so a
        # graph convolution layer was never created
        gconv = False
        if ne == 6:
            if lde[0] == "gconv":
                gconv = True
            # drop the layer type so the remaining items line up
            lde = lde[1:]

        #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
        nunit = int(lde[0])
        actStr = lde[1]
        act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
        bnorm = lde[2] == "true"
        afterAct = lde[3] == "true"
        dpr = float(lde[4])

        if gconv:
            layers.append(GCNConv(ninp, nunit))
        else:
            layers.append(Linear(ninp, nunit))

        if bnorm:
            #with batch norm, either after or before the activation
            if afterAct:
                safeAppend(layers, act)
                layers.append(torch.nn.BatchNorm1d(nunit))
            else:
                layers.append(torch.nn.BatchNorm1d(nunit))
                safeAppend(layers, act)
        else:
            #without batch norm
            safeAppend(layers, act)

        if dpr > 0:
            layers.append(torch.nn.Dropout(dpr))
        # output of this layer feeds the next one
        ninp = nunit

    self.layers = torch.nn.ModuleList(layers)
    self.device = FeedForwardNetwork.getDevice(self)
    self.to(self.device)
    # data is loaded after the device is known, loadData moves tensors to it
    self.loadData()

    self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
    self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)
    self.trained = False
|
| 169 |
+
|
| 170 |
+
def loadData(self):
    """
    load node and edge data and create the train, validation and test masks

    Records of the data file are told apart by their number of fields: more
    than 2 fields is a node (first field skipped, last field is the label),
    exactly 2 fields is an edge and 1 field is a mask record of the form
    "mask <num nodes> <beg>:<end> ...".
    """
    cfg = self.config
    dataFilePath = cfg.getStringConfig("train.data.file")[0]
    numNodes = cfg.getIntConfig("train.data.num.nodes.total")[0]
    numLabeled = cfg.getIntConfig("train.data.num.nodes.training")[0]
    splits = cfg.getFloatListConfig("train.data.splits")[0]
    crPredMask = cfg.getBooleanConfig("predict.create.mask")[0]

    features = []
    labels = []
    edgeList = []
    mask = None
    for rec in fileRecGen(dataFilePath, ","):
        nfields = len(rec)
        if nfields > 2:
            # node record
            feat = toFloatList(rec[1:-1])
            label = int(rec[-1])
            features.append(feat)
            labels.append(label)
        elif nfields == 2:
            # edge record
            edgeList.append(toIntList(rec))
        elif nfields == 1:
            # mask record, overrides the configured number of nodes
            items = rec[0].split()
            assertEqual(items[0], "mask", "invalid mask data")
            numNodes = int(items[1])
            print(numNodes)
            mask = []
            for item in items[2:]:
                beg, end = item.split(":")[:2]
                mask.extend(range(int(beg), int(end)))

    #scale node features
    if cfg.getStringConfig("common.preprocessing")[0] == "scale":
        scalingMethod = cfg.getStringConfig("common.scaling.method")[0]
        features = scaleData(features, scalingMethod)

    # tensors on the model device; edges are transposed and made contiguous
    dx = torch.tensor(features, dtype=torch.float).to(self.device)
    dy = torch.tensor(labels, dtype=torch.long).to(self.device)
    edges = torch.tensor(edgeList, dtype=torch.long).t().contiguous().to(self.device)
    self.data = Data(x=dx, edge_index=edges, y=dy)

    def flagged(indices):
        # boolean list over all nodes, True at the given node indices
        flags = [False] * numNodes
        for i in indices:
            flags[i] = True
        return flags

    if mask is None:
        #training data in the beginning
        vaStart = int(splits[0] * numLabeled)
        teStart = vaStart + int(splits[1] * numLabeled)

        trMask = [False] * numNodes
        trMask[0:vaStart] = [True] * vaStart
        vaMask = [False] * numNodes
        vaMask[vaStart:teStart] = [True] * (teStart - vaStart)
        teMask = [False] * numNodes
        teMask[teStart:] = [True] * (numNodes - teStart)
    else:
        #training data anywhere
        if crPredMask:
            # prediction mask covers every node outside the labeled mask
            prMask = [True] * numNodes
            for i in mask:
                prMask[i] = False
            self.prMask = torch.tensor(prMask, dtype=torch.bool)

        # shuffle the labeled node indices before splitting them
        shuffle(mask, len(mask) // 2)
        lmask = len(mask)
        trme = int(splits[0] * lmask)
        vame = int((splits[0] + splits[1]) * lmask)
        trMask = flagged(mask[:trme])
        vaMask = flagged(mask[trme:vame])
        teMask = flagged(mask[vame:])

    # attach the masks to the data object as boolean tensors on the device
    self.data.train_mask = torch.tensor(trMask, dtype=torch.bool).to(self.device)
    self.data.val_mask = torch.tensor(vaMask, dtype=torch.bool).to(self.device)
    self.data.test_mask = torch.tensor(teMask, dtype=torch.bool).to(self.device)
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def descData(self):
    """
    describe data, printing size, mask, type and sample information of the
    loaded graph data
    """
    data = self.data

    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Number of node features: {data.num_node_features}')
    print(f'Number of training nodes: {data.train_mask.sum()}')
    # the original referenced a bare "data" here, which raised NameError
    print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
    print(f'Number of validation nodes: {data.val_mask.sum()}')
    print(f'Number of test nodes: {data.test_mask.sum()}')
    print(f'Is undirected: {data.is_undirected()}')

    print("Data attributes")
    print(data.keys)

    print("Data types")
    print(type(data.x))
    print(type(data.y))
    print(type(data.edge_index))
    print(type(data.train_mask))

    print("Sample data")
    print("x", data.x[:4])
    print("y", data.y[:4])
    print("edge", data.edge_index[:4])
    print("train mask", data.train_mask[:4])
    print("test mask", data.test_mask[:4])

    print("Any isolated node? " , data.has_isolated_nodes())
    print("Any self loop? ", data.has_self_loops())
    print("Is graph directed? ", data.is_directed())
|
| 301 |
+
|
| 302 |
+
def forward(self):
    """
    forward prop through all layers using the loaded graph data

    Message passing layers get the node features and the edge index, all
    other layers only get the node features.
    """
    out = self.data.x
    edgeIndex = self.data.edge_index
    for layer in self.layers:
        out = layer(out, edgeIndex) if isinstance(layer, MessagePassing) else layer(out)
    return out
|
| 313 |
+
|
| 314 |
+
@staticmethod
|
| 315 |
+
def trainModel(model):
|
| 316 |
+
"""
|
| 317 |
+
train with batch data
|
| 318 |
+
|
| 319 |
+
Parameters
|
| 320 |
+
model : torch model
|
| 321 |
+
"""
|
| 322 |
+
epochIntv = model.config.getIntConfig("train.epoch.intv")[0]
|
| 323 |
+
|
| 324 |
+
model.train()
|
| 325 |
+
if model.trackErr:
|
| 326 |
+
trErr = list()
|
| 327 |
+
vaErr = list()
|
| 328 |
+
|
| 329 |
+
for epoch in range(model.numIter):
|
| 330 |
+
out = model()
|
| 331 |
+
loss = model.lossFn(out[model.data.train_mask], model.data.y[model.data.train_mask])
|
| 332 |
+
|
| 333 |
+
#error tracking at batch level
|
| 334 |
+
if model.trackErr:
|
| 335 |
+
trErr.append(loss.item())
|
| 336 |
+
vErr = GraphConvoNetwork.evaluateModel(model)
|
| 337 |
+
vaErr.append(vErr)
|
| 338 |
+
if model.verbose and epoch % epochIntv == 0:
|
| 339 |
+
print("epoch {} loss {:.6f} val error {:.6f}".format(epoch, loss.item(), vErr))
|
| 340 |
+
|
| 341 |
+
model.optimizer.zero_grad()
|
| 342 |
+
loss.backward()
|
| 343 |
+
model.optimizer.step()
|
| 344 |
+
|
| 345 |
+
#acc = GraphConvoNetwork.evaluateModel(model, True)
|
| 346 |
+
#print(acc)
|
| 347 |
+
modelSave = model.config.getBooleanConfig("train.model.save")[0]
|
| 348 |
+
if modelSave:
|
| 349 |
+
FeedForwardNetwork.saveCheckpt(model)
|
| 350 |
+
|
| 351 |
+
if model.trackErr:
|
| 352 |
+
FeedForwardNetwork.errorPlot(model, trErr, vaErr)
|
| 353 |
+
|
| 354 |
+
model.trained = True
|
| 355 |
+
|
| 356 |
+
@staticmethod
def evaluateModel(model, verbose=False):
    """
    evaluates the model on the nodes selected by the validation mask and
    puts the model back into training mode

    Parameters
        model : torch model
        verbose : if True additional output

    Returns the score from perfMetric() for the model's loss function name
    """
    model.eval()
    with torch.no_grad():
        out = model()
        if verbose:
            print(out)
        valMask = model.data.val_mask
        yPred = out[valMask].data.cpu().numpy()
        yActual = model.data.y[valMask].data.cpu().numpy()
        if verbose:
            for pair in zip(yPred, yActual):
                print(pair)

    score = perfMetric(model.lossFnStr, yActual, yPred, model.clabels)

    # back to training mode, this is called from within the training loop
    model.train()
    return score
|
| 382 |
+
|
| 383 |
+
@staticmethod
def validateModel(model, retPred=False):
    """
    model validation on the nodes selected by the test mask

    Parameters
        model : torch model
        retPred : not used by this implementation, only the score is returned

    Returns the score from perfMetric() for the configured accuracy metric
    """
    model.eval()
    with torch.no_grad():
        # predicted class is the index of the largest output
        predicted = model().argmax(dim=1)
        testMask = model.data.test_mask
        yPred = predicted[testMask].data.cpu().numpy()
        yActual = model.data.y[testMask].data.cpu().numpy()

    score = perfMetric(model.accMetric, yActual, yPred)
    print(formatFloat(3, score, "test #perf score"))
    return score
|
| 403 |
+
|
| 404 |
+
@staticmethod
|
| 405 |
+
def modelPrediction(model, inclData=True):
|
| 406 |
+
"""
|
| 407 |
+
make prediction
|
| 408 |
+
|
| 409 |
+
Parameters
|
| 410 |
+
model : torch model
|
| 411 |
+
inclData : True to include input data
|
| 412 |
+
"""
|
| 413 |
+
cmask = model.config.getBooleanConfig("predict.create.mask")[0]
|
| 414 |
+
if not cmask:
|
| 415 |
+
print("create prediction mask property needs to be set to True")
|
| 416 |
+
return None
|
| 417 |
+
|
| 418 |
+
useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
|
| 419 |
+
if useSavedModel:
|
| 420 |
+
FeedForwardNetwork.restoreCheckpt(model)
|
| 421 |
+
else:
|
| 422 |
+
if not model.trained:
|
| 423 |
+
GraphConvoNetwork.trainModel(model)
|
| 424 |
+
|
| 425 |
+
model.eval()
|
| 426 |
+
with torch.no_grad():
|
| 427 |
+
out = model()
|
| 428 |
+
yPred = out.argmax(dim=1)
|
| 429 |
+
yPred = yPred[model.prMask].data.cpu().numpy()
|
| 430 |
+
|
| 431 |
+
if inclData:
|
| 432 |
+
dataFilePath = model.config.getStringConfig("train.data.file")[0]
|
| 433 |
+
filt = lambda r : len(r) > 2
|
| 434 |
+
ndata = list(fileFiltRecGen(dataFilePath, filt))
|
| 435 |
+
prMask = model.prMask.data.cpu().numpy()
|
| 436 |
+
assertEqual(len(ndata), prMask.shape[0], "data and mask lengths are not equal")
|
| 437 |
+
precs = list(compress(ndata, prMask))
|
| 438 |
+
precs = list(map(lambda r : r[:-1], precs))
|
| 439 |
+
assertEqual(len(precs), yPred.shape[0], "data and mask lengths are not equal")
|
| 440 |
+
res = zip(precs, yPred)
|
| 441 |
+
else:
|
| 442 |
+
res = yPred
|
| 443 |
+
return res
|
| 444 |
+
|
supv/knn.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import matplotlib
|
| 25 |
+
import random
|
| 26 |
+
import jprops
|
| 27 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 28 |
+
from random import randint
|
| 29 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 30 |
+
from util import *
|
| 31 |
+
from mlutil import *
|
| 32 |
+
from bacl import *
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# k nearest neighbor classification
|
| 36 |
+
class NearestNeighbor(BaseClassifier):
    """
    k nearest neighbor classifier wrapping sklearn KNeighborsClassifier, with all
    hyper parameters taken from a configuration file
    """

    def __init__(self, configFile):
        """
        declares the default value (or the error message for a mandatory value) of
        every supported configuration key and hands them to the base classifier

        Parameters
            configFile : config file path
        """
        defValues = {}
        defValues["common.mode"] = ("training", None)
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.preprocessing"] = (None, None)
        defValues["common.scaling.method"] = ("zscale", None)
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing training data field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.num.neighbors"] = (5, None)
        defValues["train.neighbor.weight"] = ("uniform", None)
        defValues["train.neighbor.search.algo"] = ("auto", None)
        defValues["train.neighbor.search.leaf.size"] = (10, None)
        defValues["train.neighbor.dist.metric"] = ("minkowski", None)
        defValues["train.neighbor.dist.metric.pow"] = (2.0, None)
        defValues["train.success.criterion"] = ("error", None)
        defValues["train.model.save"] = (False, None)
        defValues["train.score.method"] = ("accuracy", None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.data.fields"] = (None, "missing data field ordinals")
        defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
        defValues["predict.use.saved.model"] = (False, None)

        super(NearestNeighbor, self).__init__(configFile, defValues, __name__)

    def buildModel(self):
        """
        builds the KNeighborsClassifier from the train.* configuration, stores it in
        self.classifier and returns it
        """
        self.logger.info("...building knn classifer model")
        numNeighbors = self.config.getIntConfig("train.num.neighbors")[0]
        neighborWeight = self.config.getStringConfig("train.neighbor.weight")[0]
        searchAlgo = self.config.getStringConfig("train.neighbor.search.algo")[0]
        leafSize = self.config.getIntConfig("train.neighbor.search.leaf.size")[0]
        distMetric = self.config.getStringConfig("train.neighbor.dist.metric")[0]
        metricPow = self.config.getIntConfig("train.neighbor.dist.metric.pow")[0]

        # the configured leaf size is passed through; it used to be read and then
        # discarded in favour of a hard coded 30
        model = KNeighborsClassifier(n_neighbors=numNeighbors, weights=neighborWeight, algorithm=searchAlgo,
            leaf_size=leafSize, p=metricPow, metric=distMetric)
        self.classifier = model
        return self.classifier

    def predictProb(self, recs=None):
        """
        predicts class probability

        Parameters
            recs : records to predict for, either a string handed to
                   prepStringPredictData() or an array of feature values; when None
                   the data comes from prepPredictData()

        Returns the result of predict_proba() of the classifier
        """
        # create model
        self.prepModel()

        # input record
        if recs is None:
            featData = self.prepPredictData()
        else:
            if isinstance(recs, str):
                featData = self.prepStringPredictData(recs)
            else:
                featData = recs
            # a single record has to become a one row 2 dimensional array
            # NOTE(review): applied to caller supplied records only - confirm nesting
            if featData.ndim == 1:
                featData = featData.reshape(1, -1)

        # predict
        self.logger.info("...predicting class probability")
        clsData = self.classifier.predict_proba(featData)
        return clsData
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
supv/lrd.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import sklearn.linear_model
|
| 25 |
+
import matplotlib
|
| 26 |
+
import random
|
| 27 |
+
import jprops
|
| 28 |
+
from sklearn.linear_model import LogisticRegression
|
| 29 |
+
from random import randint
|
| 30 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 31 |
+
from util import *
|
| 32 |
+
from mlutil import *
|
| 33 |
+
from pasearch import *
|
| 34 |
+
from bacl import *
|
| 35 |
+
|
| 36 |
+
# logistic regression classification
|
| 37 |
+
class LogisticRegressionDiscriminant(BaseClassifier):
    """
    logistic regression classifier wrapping sklearn LogisticRegression, with all
    hyper parameters taken from a configuration file
    """

    def __init__(self, configFile):
        """
        declares the default value (or the error message for a mandatory value) of
        every supported configuration key and hands them to the base classifier

        Parameters
            configFile : config file path
        """
        defaults = {
            "common.mode": ("train", None),
            "common.model.directory": ("model", None),
            "common.model.file": (None, None),
            "common.scale.file.path": (None, "missing scale file path"),
            "common.preprocessing": (None, None),
            "common.verbose": (False, None),
            "train.data.file": (None, "missing training data file"),
            "train.data.fields": (None, "missing training data field ordinals"),
            "train.data.feature.fields": (None, "missing training data feature field ordinals"),
            "train.data.class.field": (None, "missing class field ordinal"),
            "train.validation": ("kfold", None),
            "train.num.folds": (5, None),
            "train.penalty": ("l2", None),
            "train.dual": (False, None),
            "train.tolerance": (0.0001, None),
            "train.regularization": (1.0, None),
            "train.fit.intercept": (True, None),
            "train.intercept.scaling": (1.0, None),
            "train.class.weight": (None, None),
            "train.random.state": (None, None),
            "train.solver": ("liblinear", None),
            "train.max.iter": (100, None),
            "train.multi.class": ("ovr", None),
            "train.verbose": (0, None),
            "train.warm.start": (False, None),
            "train.num.jobs": (None, None),
            "train.l1.ratio": (None, None),
            "train.success.criterion": ("error", None),
            "train.model.save": (False, None),
            "train.score.method": ("accuracy", None),
            "train.search.param.strategy": (None, None),
            "train.search.params": (None, None),
            "predict.data.file": (None, None),
            "predict.data.fields": (None, "missing data field ordinals"),
            "predict.data.feature.fields": (None, "missing data feature field ordinals"),
            "predict.use.saved.model": (False, None),
            "validate.data.file": (None, "missing validation data file"),
            "validate.data.fields": (None, "missing validation data field ordinals"),
            "validate.data.feature.fields": (None, "missing validation data feature field ordinals"),
            "validate.data.class.field": (None, "missing class field ordinal"),
            "validate.use.saved.model": (False, None),
            "validate.score.method": ("accuracy", None),
        }

        super(LogisticRegressionDiscriminant, self).__init__(configFile, defaults, __name__)

    def buildModel(self):
        """
        builds the LogisticRegression estimator from the train.* configuration,
        stores it in self.classifier and returns it
        """
        print("...building logistic regression model")
        conf = self.config

        # keyword arguments are evaluated top to bottom, so the configuration is
        # read in the same order as the estimator parameters are listed
        hyperParams = dict(
            penalty=conf.getStringConfig("train.penalty")[0],
            dual=conf.getBooleanConfig("train.dual")[0],
            tol=conf.getFloatConfig("train.tolerance")[0],
            C=conf.getFloatConfig("train.regularization")[0],
            fit_intercept=conf.getBooleanConfig("train.fit.intercept")[0],
            intercept_scaling=conf.getFloatConfig("train.intercept.scaling")[0],
            class_weight=conf.getStringConfig("train.class.weight")[0],
            random_state=conf.getIntConfig("train.random.state")[0],
            solver=conf.getStringConfig("train.solver")[0],
            max_iter=conf.getIntConfig("train.max.iter")[0],
            multi_class=conf.getStringConfig("train.multi.class")[0],
            verbose=conf.getIntConfig("train.verbose")[0],
            warm_start=conf.getBooleanConfig("train.warm.start")[0],
            n_jobs=conf.getIntConfig("train.num.jobs")[0],
            l1_ratio=conf.getFloatConfig("train.l1.ratio")[0],
        )

        self.classifier = LogisticRegression(**hyperParams)
        return self.classifier
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
supv/lstm.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import torch
|
| 24 |
+
from torch import nn
|
| 25 |
+
from torch.autograd import Variable
|
| 26 |
+
from torch.utils.data import DataLoader
|
| 27 |
+
from torchvision import transforms
|
| 28 |
+
import sklearn as sk
|
| 29 |
+
import matplotlib
|
| 30 |
+
import random
|
| 31 |
+
import jprops
|
| 32 |
+
from random import randint
|
| 33 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 34 |
+
from util import *
|
| 35 |
+
from mlutil import *
|
| 36 |
+
from tnn import FeedForwardNetwork
|
| 37 |
+
|
| 38 |
+
"""
|
| 39 |
+
LSTM with one or more hidden layers with multi dimensional data
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
class LstmNetwork(nn.Module):
    """
    LSTM followed by a linear output layer and an optional output activation,
    with all settings taken from a configuration file
    """

    def __init__(self, configFile):
        """
        Declares the configuration defaults and loads the configuration. The network
        layers and the data are created later by buildModel().

        Parameters
            configFile : config file path
        """
        defValues = dict()
        defValues["common.mode"] = ("training", None)
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.preprocessing"] = (None, None)
        defValues["common.scaling.method"] = ("zscale", None)
        defValues["common.scaling.minrows"] = (50, None)
        defValues["common.verbose"] = (False, None)
        defValues["common.device"] = ("cpu", None)
        defValues["train.data.file"] = (None, "missing training data file path")
        defValues["train.data.type"] = ("numeric", None)
        defValues["train.data.feat.cols"] = (None, "missing feature columns")
        defValues["train.data.target.col"] = (None, "missing target column")
        defValues["train.data.delim"] = (",", None)
        defValues["train.input.size"] = (None, "missing input size")
        defValues["train.hidden.size"] = (None, "missing hidden size")
        defValues["train.output.size"] = (None, "missing output size")
        defValues["train.num.layers"] = (1, None)
        defValues["train.seq.len"] = (1, None)
        defValues["train.batch.size"] = (32, None)
        defValues["train.batch.first"] = (False, None)
        defValues["train.drop.prob"] = (0, None)
        defValues["train.optimizer"] = ("adam", None)
        defValues["train.opt.learning.rate"] = (.0001, None)
        defValues["train.opt.weight.decay"] = (0, None)
        defValues["train.opt.momentum"] = (0, None)
        defValues["train.opt.eps"] = (1e-08, None)
        defValues["train.opt.dampening"] = (0, None)
        defValues["train.opt.momentum.nesterov"] = (False, None)
        defValues["train.opt.betas"] = ([0.9, 0.999], None)
        defValues["train.opt.alpha"] = (0.99, None)
        defValues["train.out.sequence"] = (True, None)
        defValues["train.out.activation"] = ("sigmoid", None)
        defValues["train.loss.fn"] = ("mse", None)
        defValues["train.loss.reduction"] = ("mean", None)
        defValues["train.grad.clip"] = (5, None)
        defValues["train.num.iterations"] = (500, None)
        defValues["train.save.model"] = (False, None)
        # trainLstm() reads "train.model.save"; it is declared here as well so that
        # the key it actually looks up has a default. The older key stays declared.
        defValues["train.model.save"] = (False, None)
        defValues["valid.data.file"] = (None, "missing validation data file path")
        defValues["valid.accuracy.metric"] = (None, None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.use.saved.model"] = (True, None)
        defValues["predict.output"] = ("binary", None)
        defValues["predict.feat.pad.size"] = (60, None)

        self.config = Configuration(configFile, defValues)

        super(LstmNetwork, self).__init__()

    def getConfig(self):
        """
        returns the configuration object
        """
        return self.config

    def buildModel(self):
        """
        Loads configuration and builds the various pieces necessary for the model :
        the lstm and linear layers, the output activation and the training and
        validation tensors

        Raises ValueError when the feature column setting is not a start, end pair
        """
        torch.manual_seed(9999)
        # verbose is used as a flag, so it is read as a boolean like the other flags;
        # read as a string, a non empty value such as "False" would be truthy
        self.verbose = self.config.getBooleanConfig("common.verbose")[0]
        self.inputSize = self.config.getIntConfig("train.input.size")[0]
        self.outputSize = self.config.getIntConfig("train.output.size")[0]
        self.nLayers = self.config.getIntConfig("train.num.layers")[0]
        self.hiddenSize = self.config.getIntConfig("train.hidden.size")[0]
        self.seqLen = self.config.getIntConfig("train.seq.len")[0]
        self.batchSize = self.config.getIntConfig("train.batch.size")[0]
        self.batchFirst = self.config.getBooleanConfig("train.batch.first")[0]
        dropProb = self.config.getFloatConfig("train.drop.prob")[0]
        self.outSeq = self.config.getBooleanConfig("train.out.sequence")[0]
        self.device = FeedForwardNetwork.getDevice(self)

        #model
        self.lstm = nn.LSTM(self.inputSize, self.hiddenSize, self.nLayers, dropout=dropProb, batch_first=self.batchFirst)
        self.linear = nn.Linear(self.hiddenSize, self.outputSize)
        outAct = self.config.getStringConfig("train.out.activation")[0]
        self.outAct = FeedForwardNetwork.createActivation(outAct)

        #load training data
        dataFilePath = self.config.getStringConfig("train.data.file")[0]
        self.fCols = self.config.getIntListConfig("train.data.feat.cols")[0]
        if len(self.fCols) != 2:
            raise ValueError("specify only start and end columns of features")
        self.tCol = self.config.getIntConfig("train.data.target.col")[0]
        self.delim = self.config.getStringConfig("train.data.delim")[0]

        self.fData, self.tData = self.loadData(dataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
        self.fData = torch.from_numpy(self.fData).to(self.device)
        self.tData = torch.from_numpy(self.tData).to(self.device)

        #load validation data
        vaDataFilePath = self.config.getStringConfig("valid.data.file")[0]
        self.vfData, self.vtData = self.loadData(vaDataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
        self.vfData = torch.from_numpy(self.vfData).to(self.device)
        self.vtData = torch.from_numpy(self.vtData).to(self.device)

        self.dataSize = self.fData.shape[0]
        # only whole batches are generated per iteration
        self.numBatch = int(self.dataSize / self.batchSize)
        self.restored = False

        self.to(self.device)

    def loadData(self, filePath, delim, scolStart, scolEnd, targetCol):
        """
        loads data for file with one sequence per line and data can be a vector

        Parameters
            filePath : file path
            delim : field delimiter
            scolStart : seq column start index
            scolEnd : seq column end index (inclusive)
            targetCol : target field col index, negative when there is no target

        Returns (sequence data, target data) when there is a target column, otherwise
        the sequence data only. Sequence data is float32; the target is float32 when
        the output size is 1 and int64 otherwise
        """
        hasTarget = targetCol >= 0
        cols = list(range(scolStart, scolEnd + 1, 1))
        if hasTarget:
            cols.append(targetCol)

        # ndmin keeps a file with a single line 2 dimensional
        data = np.loadtxt(filePath, delimiter=delim, usecols=cols, ndmin=2)

        #one output for whole sequence, target is the last loaded column
        sData = data[:, :-1] if hasTarget else data
        if self.config.getStringConfig("common.preprocessing")[0] == "scale":
            sData = self.scaleSeqData(sData)
        sData = sData.astype(np.float32)
        if not hasTarget:
            return sData

        #target int (index into class labels) for classification
        tData = data[:, -1]
        # np.int64 matches torch.long; np.long is not available in all numpy versions
        tData = tData.astype(np.float32) if self.outputSize == 1 else tData.astype(np.int64)
        return (sData, tData)

    def scaleSeqData(self, sData):
        """
        scales data by transforming to non sequence format, scaling and
        transforming back

        Parameters
            sData : sequence data
        """
        scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
        sData = fromMultDimSeqToTabular(sData, self.inputSize, self.seqLen)
        sData = scaleData(sData, scalingMethod)
        sData = fromTabularToMultDimSeq(sData, self.inputSize, self.seqLen)
        return sData

    def _toSeqTensor(self, rows):
        """
        transforms rows of shape (numRows, seqLength x inputSize) to a float32 tensor
        of shape (numRows, seqLength, inputSize) when batch first, otherwise
        (seqLength, numRows, inputSize). Rows shorter than seqLength x inputSize are
        zero padded at the end

        Parameters
            rows : 2 dimensional feature data
        """
        rows = torch.as_tensor(rows)
        nRows = rows.shape[0]
        flat = torch.zeros([nRows, self.seqLen * self.inputSize], dtype=torch.float32, device=rows.device)
        flat[:, :rows.shape[1]] = rows
        # column c of a row is time step c / inputSize, input component c % inputSize
        seqData = flat.view(nRows, self.seqLen, self.inputSize)
        if not self.batchFirst:
            seqData = seqData.transpose(0, 1).contiguous()
        return seqData

    def formattedBatchGenarator(self):
        """
        transforms training data from (dataSize, seqLength x inputSize) to (batch, seqLength, inputSize) tensor
        or (seqLength, batch, inputSize) tensor and yields numBatch pairs of
        (feature batch, target batch) made of sampled rows
        """
        tdType = torch.float32 if self.outputSize == 1 else torch.long
        for _ in range(self.numBatch):
            # one sampled row index per batch slot
            # presumably sampleUniform returns a random int in the given range - confirm
            picks = [int(sampleUniform(0, self.dataSize-1)) for _ in range(self.batchSize)]
            bfData = self._toSeqTensor(self.fData[picks])
            btData = self.tData[picks].to(tdType)

            #for seq output correct first 2 dimensions, there is nothing to swap
            #when there is one target value per sequence
            if self.outSeq and not self.batchFirst and btData.dim() > 1:
                btData = torch.transpose(btData,0,1)

            yield (bfData, btData)

    def formatData(self, fData, tData=None):
        """
        transforms validation or prediction data from (dataSize, seqLength x inputSize) to
        (batch, seqLength, inputSize) tensor or (seqLength, batch, inputSize) tensor

        Parameters
            fData : feature data
            tData : target data

        Returns (feature data, target data) when target data is given, otherwise
        the feature data only
        """
        bfData = self._toSeqTensor(fData)
        if tData is None:
            return bfData

        # first 2 dimensions are swapped for seq output only when the target has them
        swap = self.outSeq and not self.batchFirst and tData.dim() > 1
        btData = torch.transpose(tData,0,1) if swap else tData
        return (bfData, btData)

    def forward(self, x, h):
        """
        Forward pass

        Parameters
            x : input data
            h : hidden state, the (hidden, cell) pair handed to the lstm

        Returns (output, new hidden state). Output has one row per sequence element
        for sequence output and one row per sequence otherwise
        """
        out, hout = self.lstm(x,h)
        if self.outSeq:
            # seq to seq prediction
            out = out.reshape(-1, self.hiddenSize)
            out = self.linear(out)
            if self.outAct is not None:
                out = self.outAct(out)
            # row count follows the actual batch, which differs from the training
            # batch size for validation and prediction
            out = out.view(-1, self.outputSize)
        else:
            #seq to one prediction, last time step on the time axis
            last = out[:, self.seqLen - 1] if self.batchFirst else out[self.seqLen - 1]
            out = last.reshape(-1, self.hiddenSize)
            out = self.linear(out)
            if self.outAct is not None:
                out = self.outAct(out)

        return out, hout

    def initHidden(self, batch):
        """
        Creates the initial hidden state, a pair of zero tensors of shape
        (numLayers, batch, hiddenSize)

        Parameters
            batch : batch size
        """
        hidden = (torch.zeros(self.nLayers,batch,self.hiddenSize),
            torch.zeros(self.nLayers,batch,self.hiddenSize))
        return hidden

    def trainLstm(self):
        """
        trains the lstm, validates it with the validation data and saves a check
        point if configured
        """
        print("..starting training")
        self.train()

        optimizerName = self.config.getStringConfig("train.optimizer")[0]
        self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizerName)
        lossFn = self.config.getStringConfig("train.loss.fn")[0]
        criterion = FeedForwardNetwork.createLossFunction(self, lossFn)
        clip = self.config.getFloatConfig("train.grad.clip")[0]
        numIter = self.config.getIntConfig("train.num.iterations")[0]
        accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

        for it in range(numIter):
            for b, (inputs, labels) in enumerate(self.formattedBatchGenarator()):
                #forward pass, hidden state starts from zero for every batch
                hid = self.initHidden(self.batchSize)
                hid = (hid[0].to(self.device), hid[1].to(self.device))
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                output, hid = self(inputs, hid)

                #loss
                if self.outSeq:
                    labels = labels.view(self.batchSize * self.seqLen, -1)
                loss = criterion(output, labels)

                if self.verbose and it % 50 == 0 and b % 10 == 0:
                    print("epoch {} batch {} loss {:.6f}".format(it, b, loss.item()))

                # zero gradients, perform a backward pass, and update the weights.
                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.parameters(), clip)
                self.optimizer.step()

        #validate
        print("..validating model")
        self.eval()
        with torch.no_grad():
            fData, tData = self.formatData(self.vfData, self.vtData)
            fData = fData.to(self.device)
            # number of validation sequences
            vsize = self.vfData.shape[0]
            hid = self.initHidden(vsize)
            hid = (hid[0].to(self.device), hid[1].to(self.device))
            yPred, _ = self(fData, hid)
            yPred = yPred.detach().cpu().numpy()
            yActual = tData.detach().cpu().numpy()

        if self.verbose:
            print("\npredicted \t\t actual")
            for i in range(vsize):
                print(str(yPred[i]) + "\t" + str(yActual[i]))

        score = perfMetric(accMetric, yActual, yPred)
        print(formatFloat(3, score, "perf score"))

        #save
        modelSave = self.config.getBooleanConfig("train.model.save")[0]
        if modelSave:
            FeedForwardNetwork.saveCheckpt(self)

    def predictLstm(self):
        """
        predicts for the prediction data file, with the saved model or after
        training depending on configuration, and prints the prediction
        """
        print("..predicting using model")
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            FeedForwardNetwork.restoreCheckpt(self)
        else:
            self.trainLstm()

        # negative target column, the prediction data has features only
        prDataFilePath = self.config.getStringConfig("predict.data.file")[0]
        pfData = self.loadData(prDataFilePath, self.delim, self.fCols[0], self.fCols[1], -1)
        pfData = torch.from_numpy(pfData)
        dsize = pfData.shape[0]

        #predict
        self.eval()
        with torch.no_grad():
            fData = self.formatData(pfData)
            fData = fData.to(self.device)
            hid = self.initHidden(dsize)
            hid = (hid[0].to(self.device), hid[1].to(self.device))
            yPred, _ = self(fData, hid)
            yPred = yPred.detach().cpu().numpy()

        if self.outputSize == 2:
            #classification
            yPred = FeedForwardNetwork.processClassifOutput(yPred, self.config)

        # print prediction
        FeedForwardNetwork.printPrediction(yPred, self.config, prDataFilePath)
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
|
supv/mcalib.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
from sklearn.neighbors import KDTree
|
| 25 |
+
import matplotlib
|
| 26 |
+
import random
|
| 27 |
+
import jprops
|
| 28 |
+
from random import randint
|
| 29 |
+
import statistics
|
| 30 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 31 |
+
from util import *
|
| 32 |
+
from mlutil import *
|
| 33 |
+
from tnn import *
|
| 34 |
+
from stats import *
|
| 35 |
+
|
| 36 |
+
"""
|
| 37 |
+
neural model calibration
|
| 38 |
+
"""
|
| 39 |
+
class ModelCalibration(object):
    """
    calibration and sharpness diagnostics for a model validated through
    FeedForwardNetwork. All public methods are static and take the model.
    """
    def __init__(self):
        pass

    @staticmethod
    def _binByConfidence(yPred, yActual, nBins):
        """
        distributes (predicted, actual) pairs into equal width bins spanning
        the observed range of the predicted values

        Parameters
            yPred : flat array of predicted values
            yActual : flat array of actual values
            nBins : number of bins

        Returns
            tuple (blist, minConf, bsize) where blist[i] is the possibly empty
            list of pairs in bin i, minConf the smallest prediction and bsize
            the bin width
        """
        minConf = yPred.min()
        maxConf = yPred.max()
        bsize = (maxConf - minConf) / nBins
        blist = [list() for _ in range(nBins)]
        for yp, ya in zip(yPred, yActual):
            if bsize > 0:
                # the max value (and any float round off) goes to the last bin
                indx = min(int((yp - minConf) / bsize), nBins - 1)
            else:
                # all predictions identical, avoid division by zero
                indx = 0
            blist[indx].append((yp, ya))
        return blist, minConf, bsize

    @staticmethod
    def _hitCount(plist, prThreshhold):
        """
        number of pairs with prediction above threshold and actual value 1

        Parameters
            plist : list of (predicted, actual) pairs
            prThreshhold : probability threshold for positive prediction
        """
        return sum(1 for yp, ya in plist if yp > prThreshhold and ya == 1)

    @staticmethod
    def findModelCalibration(model):
        """
        model calibration : bins validation predictions by confidence, plots per
        bin accuracy against mean confidence and prints expected and maximum
        calibration error

        Parameters
            model : model object, config keys calibrate.num.bins and
            calibrate.pred.prob.thresh are read from model.config
        """
        FeedForwardNetwork.prepValidate(model)
        FeedForwardNetwork.validateModel(model)

        yPred = model.yPred.flatten()
        yActual = model.validOutData.flatten()
        nsamp = len(yActual)

        nBins = model.config.getIntConfig("calibrate.num.bins")[0]
        prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
        blist, minConf, bsize = ModelCalibration._binByConfidence(yPred, yActual, nBins)

        x = list()
        y = list()
        yideal = list()
        ece = 0
        mce = 0

        # per bin confidence and accuracy
        for b, plist in enumerate(blist):
            if plist:
                #confidence
                ypm = statistics.mean([p[0] for p in plist])

                #accuracy, counts only positives predicted as positive
                acc = ModelCalibration._hitCount(plist, prThreshhold) / len(plist)

                ce = abs(ypm - acc)
                ece += len(plist) * ce
                if ce > mce:
                    mce = ce
            else:
                # empty bin : use bin mid point as confidence and zero accuracy
                ypm = minConf + (b + 0.5) * bsize
                acc = 0
            x.append(ypm)
            y.append(acc)
            yideal.append(ypm)

        #calibration plot
        drawPairPlot(x, y, yideal, "confidence", "accuracy", "actual", "ideal")

        print("confidence\taccuracy")
        for conf, acc in zip(x, y):
            print("{:.3f}\t{:.3f}".format(conf, acc))

        #expected calibration error, bin errors weighted by bin population
        ece /= nsamp
        print("expected calibration error\t{:.3f}".format(ece))
        print("maximum calibration error\t{:.3f}".format(mce))

    @staticmethod
    def findModelCalibrationLocal(model, nWorst=19):
        """
        model calibration based on k nearest neighbors : for every validation
        record, confidence and accuracy are averaged over its neighbors and the
        records with the largest gap between the two are printed

        Parameters
            model : model object, config keys calibrate.num.nearest.neighbors and
            calibrate.pred.prob.thresh are read from model.config
            nWorst : max number of worst calibrated records to print
        """
        FeedForwardNetwork.prepValidate(model)
        FeedForwardNetwork.validateModel(model)

        yPred = model.yPred.flatten()
        yActual = model.validOutData.flatten()

        neighborCnt = model.config.getIntConfig("calibrate.num.nearest.neighbors")[0]
        prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
        fData = model.validFeatData.numpy()
        tree = KDTree(fData, leaf_size=4)

        _, ind = tree.query(fData, k=neighborCnt)
        calibs = list()
        #all data
        for si, ni in enumerate(ind):
            conf = 0
            ypcount = 0
            #all neighbors
            for i in ni:
                conf += yPred[i]
                if yPred[i] > prThreshhold and yActual[i] == 1:
                    ypcount += 1
            conf /= neighborCnt
            acc = ypcount / neighborCnt
            calibs.append((si, conf, acc))

        #descending sort by difference between confidence and accuracy
        calibs = sorted(calibs, key=lambda c : abs(c[1] - c[2]), reverse=True)
        print("local calibration")
        print("conf\taccu\trecord")
        # slicing tolerates validation sets smaller than nWorst
        for si, conf, acc in calibs[:nWorst]:
            rec = toStrFromList(fData[si], 3)
            print("{:.3f}\t{:.3f}\t{}".format(conf, acc, rec))

    @staticmethod
    def findModelSharpness(model):
        """
        model sharpness : bins validation predictions by confidence, plots per
        bin accuracy against the global accuracy and prints the mean absolute
        difference between the two as contrast

        Parameters
            model : model object, config keys calibrate.num.bins and
            calibrate.pred.prob.thresh are read from model.config
        """
        FeedForwardNetwork.prepValidate(model)
        FeedForwardNetwork.validateModel(model)

        yPred = model.yPred.flatten()
        yActual = model.validOutData.flatten()
        nsamp = len(yActual)

        nBins = model.config.getIntConfig("calibrate.num.bins")[0]
        prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
        blist, _, _ = ModelCalibration._binByConfidence(yPred, yActual, nBins)

        y = list()
        ypgcount = 0
        # per bin accuracy
        for plist in blist:
            ypcount = ModelCalibration._hitCount(plist, prThreshhold)
            ypgcount += ypcount
            # empty bin gets zero accuracy, same as in findModelCalibration
            acc = ypcount / len(plist) if plist else 0
            y.append(acc)

        print("{} {}".format(ypgcount, nsamp))
        accg = ypgcount / nsamp
        accgl = [accg] * nBins
        x = list(range(nBins))
        drawPairPlot(x, y, accgl, "discretized confidence", "accuracy", "local", "global")

        contrast = statistics.mean([abs(acc - accg) for acc in y])
        print("contrast {:.3f}".format(contrast))
|
| 238 |
+
|
| 239 |
+
"""
|
| 240 |
+
neural model robustness
|
| 241 |
+
"""
|
| 242 |
+
class ModelRobustness(object):
    """
    robustness analysis by validating the model on local neighborhoods of the data
    """
    def __init__(self):
        pass

    def localPerformance(self, model, fpath, nsamp, neighborCnt):
        """
        local performance sampling : repeatedly picks a random record, validates
        the model on the nearest neighbors of that record, then reports the
        score distribution and the worst scoring records

        Parameters
            model : model object
            fpath : data file path
            nsamp : number of random records to sample
            neighborCnt : number of nearest neighbors used for each sample
        """
        #load data
        featData, outData = FeedForwardNetwork.prepData(model, fpath)
        numRecs, numCols = featData.shape[0], featData.shape[1]

        #nearest neighbor index
        kdTree = KDTree(featData, leaf_size=4)

        sampled = list()
        for _ in range(nsamp):
            center = randomInt(0, numRecs - 1)
            query = np.reshape(featData[center], (1, numCols))
            _, nbrs = kdTree.query(query, k=neighborCnt)
            nbrs = nbrs[0]

            # validate on the neighborhood only
            model.setValidationData((featData[nbrs], outData[nbrs]), False)
            sampled.append((center, FeedForwardNetwork.validateModel(model)))

        #performance distribution
        scores = [sc for _, sc in sampled]
        m, s = basicStat(scores)
        print("model performance: mean {:.3f}\tstd dev {:.3f}".format(m,s))
        drawHist(scores, "model accuracy", "accuracy", "frequency")

        #worst performance, ascending by score
        worst = sorted(sampled, key=lambda rec : rec[1])[:5]
        print(worst)

        lines = getFileLines(fpath, None)
        print("worst performing features regions")
        for recIndx, sc in worst:
            print("score {:.3f}\t{}".format(sc, lines[recIndx]))
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
"""
|
| 300 |
+
conformal prediction for regression
|
| 301 |
+
"""
|
| 302 |
+
class ConformalRegressionPrediction(object):
    """
    conformal prediction for regression, using absolute error as conformal score
    """
    def __init__(self):
        # holds scoreConfBound, ymin and ymax once calibrated or restored
        self.calibration = dict()

    def calibrate(self, ypair, confBound):
        """
        calibration for conformal prediction

        Parameters
            ypair : iterable of (predicted, actual) value pairs
            confBound : fraction of the sorted conformal scores below the score bound

        Raises
            ValueError : if ypair is empty
        """
        pairs = list(ypair)
        if not pairs:
            raise ValueError("no calibration data provided")

        cscores = sorted(abs(yp - ya) for yp, ya in pairs)
        actuals = [ya for _, ya in pairs]
        ymin = min(actuals)
        ymax = max(actuals)

        drawHist(cscores, "conformal score distribution", "conformal score", "frequency", 20)

        # clamp so that a confBound of 1.0 selects the largest score instead of
        # indexing past the end
        cbi = min(int(confBound * len(cscores)), len(cscores) - 1)
        self.calibration["scoreConfBound"] = cscores[cbi]
        self.calibration["ymin"] = ymin
        self.calibration["ymax"] = ymax
        print(self.calibration)

    def saveCalib(self, fPath):
        """
        saves conformal score calibration

        Parameters
            fPath : file path
        """
        saveObject(self.calibration, fPath)

    def restoreCalib(self, fPath):
        """
        restores conformal score calibration

        Parameters
            fPath : file path
        """
        self.calibration = restoreObject(fPath)
        print(self.calibration)

    def getPredRange(self, yp, nstep=100):
        """
        get prediction range and related data

        The calibrated range of actual values is scanned in nstep steps and the
        grid points whose distance from yp is below the calibrated score bound
        make up the prediction range.

        Parameters
            yp : predicted value
            nstep : number of steps in the calibrated range

        Returns
            dictionary with predRangeMin, predRangeMax, status (accepted or
            rejected) and confidence. When no grid point is within the score
            bound the range values are None, status is rejected and confidence
            is 0.0

        Raises
            ValueError : if calibrated ymax is not greater than ymin
        """
        ymin = self.calibration["ymin"]
        ymax = self.calibration["ymax"]
        scoreConfBound = self.calibration["scoreConfBound"]
        span = ymax - ymin
        if span <= 0:
            raise ValueError("calibration range is empty, ymax must be greater than ymin")

        # grid is ascending, so the points within the bound are contiguous
        grid = np.arange(ymin, ymax, span / nstep)
        inside = grid[np.abs(yp - grid) < scoreConfBound]

        res = dict()
        if inside.size == 0:
            # prediction too far from the calibrated range
            res["predRangeMin"] = None
            res["predRangeMax"] = None
            res["status"] = "rejected"
            res["confidence"] = 0.0
            return res

        rmin = inside[0]
        rmax = inside[-1]
        res["predRangeMin"] = rmin
        res["predRangeMax"] = rmax
        accepted = rmin <= yp <= rmax
        res["status"] = "accepted" if accepted else "rejected"
        # narrower range relative to calibrated range means higher confidence
        res["confidence"] = 1.0 - (rmax - rmin) / span
        return res
|
| 383 |
+
|
| 384 |
+
|
supv/mcclf.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# Author: Pranab Ghosh
|
| 4 |
+
#
|
| 5 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 6 |
+
# may not use this file except in compliance with the License. You may
|
| 7 |
+
# obtain a copy of the License at
|
| 8 |
+
#
|
| 9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
+
#
|
| 11 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 14 |
+
# implied. See the License for the specific language governing
|
| 15 |
+
# permissions and limitations under the License.
|
| 16 |
+
|
| 17 |
+
# Package imports
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import matplotlib.pyplot as plt
|
| 21 |
+
import numpy as np
|
| 22 |
+
import random
|
| 23 |
+
import jprops
|
| 24 |
+
from random import randint
|
| 25 |
+
from matumizi.util import *
|
| 26 |
+
from matumizi.mlutil import *
|
| 27 |
+
|
| 28 |
+
"""
|
| 29 |
+
Markov chain classifier
|
| 30 |
+
"""
|
| 31 |
+
class MarkovChainClassifier():
    """
    two class classifier for state sequences based on a first order state
    transition probability matrix per class label
    """
    def __init__(self, configFile):
        """
        constructor

        Parameters
            configFile: config file path
        """
        defValues = {}
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.verbose"] = (False, None)
        defValues["common.states"] = (None, "missing state list")
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.class.labels"] = (["F", "T"], None)
        defValues["train.data.key.len"] = (1, None)
        defValues["train.model.save"] = (False, None)
        defValues["train.score.method"] = ("accuracy", None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.use.saved.model"] = (True, None)
        defValues["predict.log.odds.threshold"] = (0, None)
        defValues["validate.data.file"] = (None, "missing validation data file")
        defValues["validate.use.saved.model"] = (False, None)
        defValues["valid.accuracy.metric"] = ("acc", None)
        self.config = Configuration(configFile, defValues)

        # state transition matrix per class label, initialized with ones so that
        # no transition ends up with zero probability
        self.stTranPr = dict()
        self.clabels = self.config.getStringListConfig("train.data.class.labels")[0]
        self.states = self.config.getStringListConfig("common.states")[0]
        self.nstates = len(self.states)
        for cl in self.clabels:
            self.stTranPr[cl] = np.ones((self.nstates,self.nstates))

    def train(self):
        """
        train model : counts state transitions per class label, normalizes each
        row to probabilities and optionally saves the model
        """
        # start from fresh counts so that repeated training does not accumulate
        # on top of already normalized probabilities
        for cl in self.clabels:
            self.stTranPr[cl] = np.ones((self.nstates,self.nstates))

        #state transition matrix
        tdfPath = self.config.getStringConfig("train.data.file")[0]
        klen = self.config.getIntConfig("train.data.key.len")[0]
        for rec in fileRecGen(tdfPath):
            # record layout : key fields, class label, state sequence
            cl = rec[klen]
            seq = rec[klen+1:]
            for frState, toState in zip(seq, seq[1:]):
                fst = self.states.index(frState)
                tst = self.states.index(toState)
                self.stTranPr[cl][fst][tst] += 1

        #normalize each row to probability
        for cl in self.clabels:
            stp = self.stTranPr[cl]
            stp /= stp.sum(axis=1, keepdims=True)

        #save
        if self.config.getBooleanConfig("train.model.save")[0]:
            mdPath = self.config.getStringConfig("common.model.directory")[0]
            assert os.path.exists(mdPath), "model save directory does not exist"
            mfPath = self.config.getStringConfig("common.model.file")[0]
            mfPath = os.path.join(mdPath, mfPath)

            # one label line followed by the matrix rows for each class
            with open(mfPath, "w") as fh:
                for cl in self.clabels:
                    fh.write("label:" + cl +"\n")
                    for r in self.stTranPr[cl]:
                        fh.write(",".join(toStrList(r, 6)) + "\n")

    def validate(self):
        """
        validate using model, prints the performance score
        """
        # validation has its own saved model flag, which was previously ignored
        # in favor of the prediction flag
        useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
        if useSavedModel:
            self.__restoreModel()
        else:
            self.train()

        vdfPath = self.config.getStringConfig("validate.data.file")[0]
        accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

        yac, ypr = self.__getPrediction(vdfPath, True)
        if isinstance(self.clabels[0], str):
            yac = self.__toIntClabel(yac)
            ypr = self.__toIntClabel(ypr)
        score = perfMetric(accMetric, yac, ypr)
        print(formatFloat(3, score, "perf score"))

    def predict(self):
        """
        predict using model

        Returns
            list of predicted class labels
        """
        useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
        if useSavedModel:
            self.__restoreModel()
        else:
            self.train()

        #predict
        pdfPath = self.config.getStringConfig("predict.data.file")[0]
        _ , ypr = self.__getPrediction(pdfPath)
        return ypr

    def __restoreModel(self):
        """
        restore model from the file written by train()
        """
        mdPath = self.config.getStringConfig("common.model.directory")[0]
        assert os.path.exists(mdPath), "model save directory does not exist"
        mfPath = self.config.getStringConfig("common.model.file")[0]
        mfPath = os.path.join(mdPath, mfPath)
        stp = None
        cl = None
        # NOTE(review): assumes fileRecGen yields each line as a list of comma
        # separated fields - confirm against matumizi
        for rec in fileRecGen(mfPath):
            # checking the prefix keeps one state matrix rows from being taken as labels
            if len(rec) == 1 and rec[0].startswith("label:"):
                if stp is not None:
                    self.stTranPr[cl] = np.array(stp)
                cl = rec[0].split(":", 1)[1]
                stp = list()
            else:
                stp.append(asFloatList(rec))

        # matrix of the last label
        if stp is not None:
            self.stTranPr[cl] = np.array(stp)

    def __getPrediction(self, fpath, validate=False):
        """
        get predictions based on the sum of log odds of the state transitions

        Parameters
            fpath : data file path
            validate: True if validation

        Returns
            tuple (actual labels, predicted labels), actual labels are
            collected only when validating
        """
        nc = self.clabels[0]
        pc = self.clabels[1]
        thold = self.config.getFloatConfig("predict.log.odds.threshold")[0]
        klen = self.config.getIntConfig("train.data.key.len")[0]
        # validation records carry the class label after the key fields
        offset = klen+1 if validate else klen

        # log odds of every transition, computed once instead of per record
        logOdds = np.log(self.stTranPr[pc] / self.stTranPr[nc])

        ypr = list()
        yac = list()
        for rec in fileRecGen(fpath):
            lodds = 0
            seq = rec[offset:]
            for frState, toState in zip(seq, seq[1:]):
                fst = self.states.index(frState)
                tst = self.states.index(toState)
                lodds += logOdds[fst][tst]
            prc = pc if lodds > thold else nc
            ypr.append(prc)
            if validate:
                yac.append(rec[klen])
            else:
                print(prc + "\t" + ",".join(rec))

        return (yac, ypr)

    def __toIntClabel(self, labels):
        """
        convert string class label to int, the index of the label in the class label list

        Parameters
            labels : class label values
        """
        return [self.clabels.index(lb) for lb in labels]
|
| 206 |
+
|
| 207 |
+
|
supv/nlm.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
from random import randint
|
| 21 |
+
import random
|
| 22 |
+
import time
|
| 23 |
+
from datetime import datetime
|
| 24 |
+
import re, string, unicodedata
|
| 25 |
+
import spacy
|
| 26 |
+
import torch
|
| 27 |
+
from collections import defaultdict
|
| 28 |
+
import pickle
|
| 29 |
+
import numpy as np
|
| 30 |
+
import re
|
| 31 |
+
from sentence_transformers import CrossEncoder
|
| 32 |
+
|
| 33 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 34 |
+
from util import *
|
| 35 |
+
from mlutil import *
|
| 36 |
+
|
| 37 |
+
"""
|
| 38 |
+
neural language model
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
class NeuralLangModel(object):
    def __init__(self):
        """
        initialize
        """
        self.dexts = None

    def loadDocs(self, fpaths):
        """
        loads documents from one file, all files under a directory or a list of files

        Parameters
            fpaths : comma separated file paths, or one file path or one directory path

        Returns
            tuple (list of document texts, list of document names)
        """
        fPaths = fpaths.split(",")
        if len(fPaths) == 1:
            if os.path.isfile(fPaths[0]):
                #one file
                print("got one file from path")
                # one element list, so that names pair up with texts
                dnames = fPaths
                dtexts = [getOneFileContent(fPaths[0])]
            else:
                #all files under directory
                print("got all files under directory from path")
                dtexts, dnames = getFileContent(fPaths[0])
                print("found following files")
                for dt, dn in zip(dtexts, dnames):
                    print(dn + "\t" + dt[:40])
        else:
            #list of files
            print("got list of files from path")
            # use the split list, iterating the raw string would go character by character
            dnames = fPaths
            dtexts = [getOneFileContent(fp) for fp in fPaths]

        return (dtexts, dnames)
|
| 75 |
+
|
| 76 |
+
#Encoded doc
|
| 77 |
+
class EncodedDoc:
    """
    document text and name with optional rank, encoding and score
    """
    def __init__(self, dtext, dname, drank=None):
        """
        initialize

        Parameters
            dtext : document text
            dname : document name
            drank : document rank
        """
        self.dtext, self.dname, self.drank = dtext, dname, drank
        # encoding and score are not known until computed
        self.denc = None
        self.score = None

    def encode(self, nlp):
        """
        encode the document text and keep the result

        Parameters
            nlp : callable applied to the document text
        """
        encoded = nlp(self.dtext)
        self.denc = encoded
|
| 93 |
+
|
| 94 |
+
#similarity at token and sentence level for BERT encoding
|
| 95 |
+
class SemanticSearch:
|
| 96 |
+
def __init__(self, docs=None):
    """
    initialize : loads the spacy BERT model and sets the document list

    Parameters
        docs : documents, a new empty list is used when not provided
    """
    print("loading BERT transformer model")
    self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
    if docs is None:
        docs = list()
    self.docs = docs
|
| 103 |
+
|
| 104 |
+
def docAv(self,qu, doc):
    """
    whole doc similarity, as returned by the similarity method of the query

    Parameters
        qu : query
        doc : document
    """
    wholeSim = qu.similarity(doc)
    return wholeSim
|
| 109 |
+
|
| 110 |
+
def tokSimAv(self, qu, doc):
    """
    token pair wise average similarity

    Parameters
        qu : query
        doc : document

    NOTE(review): the tokSimMax family converts qu and doc with __getTensor
    before comparing, confirm whether simAll expects the raw arguments
    passed here
    """
    # simAll is called as a method, as the sentSim methods do
    qts = self.simAll(qu, doc)
    return np.mean(qts)
|
| 117 |
+
|
| 118 |
+
def tokSimMed(self, qu, doc):
    """
    token pair wise median similarity

    Parameters
        qu : query
        doc : document

    NOTE(review): the tokSimMax family converts qu and doc with __getTensor
    before comparing, confirm whether simAll expects the raw arguments
    passed here
    """
    # simAll is called as a method, as the sentSim methods do
    qts = self.simAll(qu, doc)
    return np.median(qts)
|
| 126 |
+
|
| 127 |
+
def tokSimMax(self, qu, doc):
    """
    token pair wise max (tsma) : largest similarity over all pairs of query
    and document elements returned by __getTensor

    Parameters
        qu : query
        doc : document
    """
    tensors = (self.__getTensor(qu), self.__getTensor(doc))
    return self.simMax(*tensors)
|
| 134 |
+
|
| 135 |
+
def tokSimAvMax(self, qu, doc):
    """
    token max then average (tsavm) : best document match for each query
    element, averaged over the query elements

    Parameters
        qu : query
        doc : document
    """
    tensors = (self.__getTensor(qu), self.__getTensor(doc))
    return self.simAvMax(*tensors)
|
| 142 |
+
|
| 143 |
+
def tokSimMaxAv(self, qu, doc):
    """
    token average and then max, delegates to simMaxAv with the elements
    returned by __getTensor

    Parameters
        qu : query
        doc : document
    """
    tensors = (self.__getTensor(qu), self.__getTensor(doc))
    return self.simMaxAv(*tensors)
|
| 150 |
+
|
| 151 |
+
def sentSimAv(self, qu, doc):
|
| 152 |
+
"""
|
| 153 |
+
sentence wise average
|
| 154 |
+
"""
|
| 155 |
+
qse, dse = self.__sentEnc(qu, doc)
|
| 156 |
+
sims = self.simAll(qse, dse)
|
| 157 |
+
return numpy.mean(sims)
|
| 158 |
+
|
| 159 |
+
def sentSimMed(self, qu, doc):
|
| 160 |
+
"""
|
| 161 |
+
sentence wise average (ssma)
|
| 162 |
+
"""
|
| 163 |
+
qse, dse = self.__sentEnc(qu, doc)
|
| 164 |
+
sims = self.simAll(qse, dse)
|
| 165 |
+
return numpy.median(sims)
|
| 166 |
+
|
| 167 |
+
def sentSimMax(self, qu, doc):
|
| 168 |
+
"""
|
| 169 |
+
sentence wise average (ssma)
|
| 170 |
+
"""
|
| 171 |
+
qse, dse = self.__sentEnc(qu, doc)
|
| 172 |
+
sims = self.simAll(qse, dse)
|
| 173 |
+
return numpy.maximum(sims)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def sentSimAvMax(self, qu, doc):
|
| 177 |
+
"""
|
| 178 |
+
sentence max then average (tsavm)
|
| 179 |
+
"""
|
| 180 |
+
qse, dse = self.__sentEnc(qu, doc)
|
| 181 |
+
return self.simAvMax(qse, dse)
|
| 182 |
+
|
| 183 |
+
def sentSimMaxAv(self, qu, doc):
|
| 184 |
+
"""
|
| 185 |
+
sentence average and then max
|
| 186 |
+
"""
|
| 187 |
+
qse, dse = self.__sentEnc(qu, doc)
|
| 188 |
+
return self.simMaxAv(qse, dse)
|
| 189 |
+
|
| 190 |
+
def simMax(self, qte, dte):
|
| 191 |
+
"""
|
| 192 |
+
max similarity between 2 elements
|
| 193 |
+
"""
|
| 194 |
+
msi = 0
|
| 195 |
+
for qt in qte:
|
| 196 |
+
for dt in dte:
|
| 197 |
+
si = cosineSimilarity(qt, dt)
|
| 198 |
+
if not math.isnan(si) and si > msi:
|
| 199 |
+
msi = si
|
| 200 |
+
return msi
|
| 201 |
+
|
| 202 |
+
def simAvMax(self, qte, dte):
|
| 203 |
+
"""
|
| 204 |
+
max then average (tsavm)
|
| 205 |
+
"""
|
| 206 |
+
qts = list()
|
| 207 |
+
for qt in qte:
|
| 208 |
+
msi = 0
|
| 209 |
+
for dt in dte:
|
| 210 |
+
si = cosineSimilarity(qt, dt)
|
| 211 |
+
if not math.isnan(si) and si > msi:
|
| 212 |
+
msi = si
|
| 213 |
+
qts.append(msi)
|
| 214 |
+
|
| 215 |
+
amsi = numpy.mean(numpy.array(qts))
|
| 216 |
+
return amsi
|
| 217 |
+
|
| 218 |
+
def simMaxAv(self, lqe, lde):
|
| 219 |
+
"""
|
| 220 |
+
average and then max
|
| 221 |
+
"""
|
| 222 |
+
masi = 0
|
| 223 |
+
for qe in lqe:
|
| 224 |
+
qes = list()
|
| 225 |
+
for de in lde:
|
| 226 |
+
si = cosineSimilarity(qe, de)
|
| 227 |
+
if not math.isnan(si):
|
| 228 |
+
qes.append(si)
|
| 229 |
+
av = numpy.mean(numpy.array(qes))
|
| 230 |
+
if av > masi:
|
| 231 |
+
masi = av
|
| 232 |
+
return masi
|
| 233 |
+
|
| 234 |
+
def simAll(self, lqe, lde):
|
| 235 |
+
"""
|
| 236 |
+
all similarity
|
| 237 |
+
"""
|
| 238 |
+
qes = list()
|
| 239 |
+
for qe in lqe:
|
| 240 |
+
for de in lde:
|
| 241 |
+
si = cosineSimilarity(qe, de)
|
| 242 |
+
if not math.isnan(si):
|
| 243 |
+
qes.append(si)
|
| 244 |
+
return numpy.array(qes)
|
| 245 |
+
|
| 246 |
+
def __sentEnc(self, qu, doc):
|
| 247 |
+
"""
|
| 248 |
+
sentence encoding for query and doc
|
| 249 |
+
"""
|
| 250 |
+
qstr = qu._.trf_word_pieces_
|
| 251 |
+
qte = zip(qstr, qu._.trf_last_hidden_state)
|
| 252 |
+
qse = list()
|
| 253 |
+
for t, v in qte:
|
| 254 |
+
if t == "[CLS]":
|
| 255 |
+
qse.append(v)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
dstr = doc._.trf_word_pieces_
|
| 259 |
+
dte = zip(dstr, doc._.trf_last_hidden_state)
|
| 260 |
+
dse = list()
|
| 261 |
+
for t, v in dte:
|
| 262 |
+
if t == "[CLS]":
|
| 263 |
+
dse.append(v)
|
| 264 |
+
|
| 265 |
+
enp = (numpy.array(qse), numpy.array(dse))
|
| 266 |
+
return enp
|
| 267 |
+
|
| 268 |
+
def __getTensor(self, toks):
|
| 269 |
+
"""
|
| 270 |
+
tensors from tokens
|
| 271 |
+
"""
|
| 272 |
+
return list(map(lambda t: t.tensor, toks))
|
| 273 |
+
|
| 274 |
+
def addDocs(self, docs):
|
| 275 |
+
"""
|
| 276 |
+
add named doc content
|
| 277 |
+
"""
|
| 278 |
+
self.docs.extend(docs)
|
| 279 |
+
|
| 280 |
+
def loadDocs(self, fpaths):
|
| 281 |
+
"""
|
| 282 |
+
loads documents from one file
|
| 283 |
+
"""
|
| 284 |
+
fPaths = fpaths.split(",")
|
| 285 |
+
if len(fPaths) == 1:
|
| 286 |
+
if os.path.isfile(fPaths[0]):
|
| 287 |
+
#one file
|
| 288 |
+
print("one file")
|
| 289 |
+
dnames = fpaths
|
| 290 |
+
docStr = getOneFileContent(fPaths[0])
|
| 291 |
+
dtexts = [docStr]
|
| 292 |
+
else:
|
| 293 |
+
#all files under directory
|
| 294 |
+
print("all files under directory")
|
| 295 |
+
dtexts, dnames = getFileContent(fPaths[0])
|
| 296 |
+
print("found following files")
|
| 297 |
+
for dt, dn in zip(dtexts, dnames):
|
| 298 |
+
print(dn + "\t" + dt[:40])
|
| 299 |
+
else:
|
| 300 |
+
#list of files
|
| 301 |
+
print("list of files")
|
| 302 |
+
dnames = fpaths
|
| 303 |
+
dtexts = list(map(getOneFileContent, fpaths))
|
| 304 |
+
|
| 305 |
+
docs = list(map(lambda dtext, dname : EncodedDoc(dtext, dname), zip(dtexts, dnames)))
|
| 306 |
+
self.docs.extend(docs)
|
| 307 |
+
|
| 308 |
+
def search(self, qstr, algo, gdranks=None):
|
| 309 |
+
"""
|
| 310 |
+
tensors from tokens
|
| 311 |
+
"""
|
| 312 |
+
qv = self.nlp(qstr)
|
| 313 |
+
res = list()
|
| 314 |
+
for d in self.docs:
|
| 315 |
+
dn = d.dname
|
| 316 |
+
if d.denc == None:
|
| 317 |
+
d.encode(self.nlp)
|
| 318 |
+
dv = d.denc
|
| 319 |
+
if algo == "ds":
|
| 320 |
+
si = self.docAv(qv, dv)
|
| 321 |
+
elif algo == "tsa":
|
| 322 |
+
si = self.tokSimAv(qv, dv)
|
| 323 |
+
elif algo == "tsme":
|
| 324 |
+
si = self.tokSimMed(qv, dv)
|
| 325 |
+
elif algo == "tsma":
|
| 326 |
+
si = self.tokSimMax(qv, dv)
|
| 327 |
+
elif algo == "tsavm":
|
| 328 |
+
si = self.tokSimAvMax(qv, dv)
|
| 329 |
+
elif algo == "tsmav":
|
| 330 |
+
si = self.tokSimMaxAv(qv, dv)
|
| 331 |
+
elif algo == "ssa":
|
| 332 |
+
si = self.sentSimAv(qv, dv)
|
| 333 |
+
elif algo == "ssme":
|
| 334 |
+
si = self.sentSimMed(qv, dv)
|
| 335 |
+
elif algo == "ssma":
|
| 336 |
+
si = self.sentSimMax(qv, dv)
|
| 337 |
+
elif algo == "ssavm":
|
| 338 |
+
si = self.sentSimAvMax(qv, dv)
|
| 339 |
+
elif algo == "ssmav":
|
| 340 |
+
si = self.sentSimMaxAv(qv, dv)
|
| 341 |
+
else:
|
| 342 |
+
si = -1.0
|
| 343 |
+
print("invalid semilarity algo")
|
| 344 |
+
|
| 345 |
+
#print("{} score {:.6f}".format(dn, si))
|
| 346 |
+
d.score = si
|
| 347 |
+
r = (dn, si)
|
| 348 |
+
res.append(r)
|
| 349 |
+
|
| 350 |
+
#search score for each document
|
| 351 |
+
res.sort(key=lambda r : r[1], reverse=True)
|
| 352 |
+
print("\nsorted search result")
|
| 353 |
+
print("query: {} matching algo: {}".format(qstr, algo))
|
| 354 |
+
for r in res:
|
| 355 |
+
print("{} score {:.3f}".format(r[0], r[1]))
|
| 356 |
+
|
| 357 |
+
#rank order if gold truuth rank provided
|
| 358 |
+
if gdranks is not None:
|
| 359 |
+
i = 0
|
| 360 |
+
count = 0
|
| 361 |
+
for d in gdranks:
|
| 362 |
+
while i < len(gdranks):
|
| 363 |
+
if d == res[i][0]:
|
| 364 |
+
count += 1
|
| 365 |
+
i += 1
|
| 366 |
+
break;
|
| 367 |
+
i += 1
|
| 368 |
+
ro = count / len(gdranks)
|
| 369 |
+
print("rank order {:.3f}".format(ro))
|
| 370 |
+
|
| 371 |
+
#similarity at passage or paragraph level using sbertcross encoder
|
| 372 |
+
class SemanticSimilaityCrossEnc(NeuralLangModel):

    def __init__(self, docs=None):
        """
        loads the cross encoder model and resets paragraph and score state

        docs : accepted but not used by this initializer
        """
        self.dparas = None
        self.scores = None
        print("loading cross encoder")
        self.model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")
        print("done loading cross encoder")
        # NOTE(review): super(NeuralLangModel, self) starts the lookup after NeuralLangModel,
        # so NeuralLangModel.__init__ itself is skipped - confirm this is intended
        super(NeuralLangModel, self).__init__()

    def paraSimilarity(self, dtext, fpaths, minParNl=1):
        """
        prints and stores paragraph pair similarity across 2 documents

        dtext : text of the first document, or None when both documents come from fpaths
        fpaths : passed to self.loadDocs; must yield 2 documents when dtext is None, otherwise 1
        minParNl : 1 splits paragraphs on any run of new lines, any other value on 2 or more new lines
        """
        dtexts, dnames = self.loadDocs(fpaths)
        if dtext is not None:
            assertEqual(len(dtexts), 1, "exactly 1 file needed")
            self.dtexts = list()
            self.dtexts.append(dtext)
            self.dtexts.append(dtexts[0])
        else:
            assertEqual(len(dtexts), 2, "exactly 2 files needed")
            self.dtexts = dtexts

        # paragraph separator pattern is the same for both documents
        regx = "\n+" if minParNl == 1 else "\n{2,}"
        self.dparas = list()
        for text in self.dtexts:
            normalized = text.replace("\r\n", "\n")
            paras = re.split(regx, normalized)
            print("no of paras {}".format(len(paras)))
            self.dparas.append(paras)

        # all (first doc paragraph, second doc paragraph) pairs, first doc varying slowest
        tinp = [[para1, para2] for para1 in self.dparas[0] for para2 in self.dparas[1]]

        print("input shape " + str(np.array(tinp).shape))
        scores = self.model.predict(tinp)
        print("score shape " + str(np.array(scores).shape))
        #assertEqual(len(scores), len(self.dparas[0]) * len(self.dparas[1]), "no of scores don't match no of paragraph pairs")
        print(scores)

        print("text paragraph pair wise similarity")
        for i, (para1, para2) in enumerate(tinp):
            print("first: {}\t second: {}\t score: {:.6f}".format(para1[:20], para2[:20], scores[i]))

        self.scores = scores

    def avMaxScore(self):
        """
        not implemented; does nothing
        """
        pass
|
| 428 |
+
|
| 429 |
+
def ner(text, nlp):
    """
    prints text, start offset, end offset and label of every entity found in the text

    text : text to analyze
    nlp : callable applied to the text; its result must expose the entities as .ents
    """
    #nlp = spacy.load("en_core_web_md")
    entities = nlp(text).ents
    for entity in entities:
        fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
        print(*fields)
|
| 434 |
+
|
supv/optunar.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import torch
|
| 22 |
+
from torch.utils.data import DataLoader
|
| 23 |
+
import random
|
| 24 |
+
import jprops
|
| 25 |
+
from random import randint
|
| 26 |
+
import optuna
|
| 27 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 28 |
+
from util import *
|
| 29 |
+
from mlutil import *
|
| 30 |
+
|
| 31 |
+
"""
|
| 32 |
+
neural network hyper parameter tuning with optuna
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def createTunerConfig(configFile):
    """
    create tuner config object

    configFile : tuner configuration file path
    returns : Configuration built from the file and the default values below
    """
    # presumably each entry is (default value, message for a missing mandatory value) -
    # confirm against Configuration
    defValues = {
        "train.num.layers": ([2,4], None),
        "train.num.units": (None, "missing range of number of units"),
        "train.activation": ("relu", None),
        "train.batch.normalize": (["true", "false"], None),
        "train.dropout.prob": ([-0.1, 0.5], None),
        "train.out.num.units": (None, "missing number of output units"),
        "train.out.activation": (None, "missing output activation"),
        "train.batch.size": ([16, 128], None),
        "train.opt.learning.rate": ([.0001, .005], None),
    }
    return Configuration(configFile, defValues)
|
| 52 |
+
|
| 53 |
+
def showStudyResults(study):
    """
    prints the number of trials of the study, then value and parameters of its best trial

    study : object exposing trials and best_trial, the latter with value and params
    """
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    best = study.best_trial
    print("Value: ", best.value)
    print("Params: ")
    paramLines = [" {}: {}".format(name, val) for name, val in best.params.items()]
    for line in paramLines:
        print(line)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def objective(trial, networkType, modelConfigFile, tunerConfigFile):
    """
    optuna based hyperparameter tuning for neural network

    trial : optuna trial used to sample the hyper parameters
    networkType : network type; not used here, a FeedForwardNetwork is always built
    modelConfigFile : model configuration file path
    tunerConfigFile : tuner configuration file path holding the search ranges
    returns : score returned by FeedForwardNetwork.batchTrain for the sampled configuration
    """
    tConfig = createTunerConfig(tunerConfigFile)

    #tuning parameters, all read from the tuner config created above
    nlayers = tConfig.getIntListConfig("train.num.layers")[0]
    nunits = tConfig.getIntListConfig("train.num.units")[0]
    act = tConfig.getStringConfig("train.activation")[0]
    dropOutRange = tConfig.getFloatListConfig("train.dropout.prob")[0]
    outNunits = tConfig.getIntConfig("train.out.num.units")[0]
    outAct = tConfig.getStringConfig("train.out.activation")[0]
    batchSizes = tConfig.getIntListConfig("train.batch.size")[0]
    learningRates = tConfig.getFloatListConfig("train.opt.learning.rate")[0]

    numLayers = trial.suggest_int("numLayers", nlayers[0], nlayers[1])

    #batch normalize on for all layers or none
    batchNormOptions = ["true", "false"]
    batchNorm = trial.suggest_categorical("batchNorm", batchNormOptions)

    sep = ":"
    layerConfigs = list()
    maxUnits = nunits[1]
    # hidden layers: the sampled layer count drives the loop (nlayers is only the range)
    for i in range(numLayers - 1):
        nunit = trial.suggest_int("numUnits_l{}".format(i), nunits[0], maxUnits)
        # drop out range is fractional, so it has to be sampled as a float
        dropOut = trial.suggest_float("dropOut_l{}".format(i), dropOutRange[0], dropOutRange[1])
        lconfig = [str(nunit), act, batchNorm, "true", "{:.3f}".format(dropOut)]
        layerConfigs.append(sep.join(lconfig))
        # a layer never gets more units than the layer before it
        maxUnits = nunit

    # output layer
    lconfig = [str(outNunits), outAct, "false", "false", "{:.3f}".format(-0.1)]
    layerConfigs.append(sep.join(lconfig))
    layerConfig = ",".join(layerConfigs)

    batchSize = trial.suggest_int("batchSize", batchSizes[0], batchSizes[1])
    # learning rate range is fractional as well
    learningRate = trial.suggest_float("learningRate", learningRates[0], learningRates[1])

    #train model
    # NOTE(review): FeedForwardNetwork is not imported by name in this file; presumably it
    # has to come from one of the wildcard imports - confirm
    nnModel = FeedForwardNetwork(modelConfigFile)
    nnModel.setConfigParam("train.layer.data", layerConfig)
    nnModel.setConfigParam("train.batch.size", batchSize)
    nnModel.setConfigParam("train.opt.learning.rate", learningRate)
    nnModel.buildModel()
    score = FeedForwardNetwork.batchTrain(nnModel)
    return score
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
    # command line: network type, model config file, tuner config file, number of trials
    assert len(sys.argv) == 5, "requires 4 command line args"

    networkType, modelConfigFile, tunerConfigFile = sys.argv[1:4]
    numTrial = int(sys.argv[4])

    study = optuna.create_study()
    # optuna only supplies the trial; the remaining arguments are bound here
    boundObjective = lambda trial: objective(trial, networkType, modelConfigFile, tunerConfigFile)
    study.optimize(boundObjective, n_trials=numTrial)

    showStudyResults(study)
|
| 127 |
+
|
supv/pasearch.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/Users/pranab/Tools/anaconda/bin/python
|
| 2 |
+
|
| 3 |
+
# Package imports
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
import numpy as np
|
| 7 |
+
import sklearn as sk
|
| 8 |
+
import random
|
| 9 |
+
import jprops
|
| 10 |
+
import abc
|
| 11 |
+
import math
|
| 12 |
+
import random
|
| 13 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 14 |
+
from util import *
|
| 15 |
+
|
| 16 |
+
#base parameter search
|
| 17 |
+
class BaseParameterSearch(object):
    """
    common state and interface of the parameter search strategies
    """
    # NOTE(review): __metaclass__ is the Python 2 way to set a metaclass; under Python 3 it
    # is a plain class attribute, so the abstract methods below would not be enforced -
    # confirm the target Python version
    __metaclass__ = abc.ABCMeta

    def __init__(self, verbose):
        """
        verbose : stored as self.verbose
        """
        self.verbose = verbose
        self.curIter = 0
        self.bestSolution = None
        self.parameters, self.paramData, self.currentParams = [], {}, []

    def addParam(self, param):
        """
        add param name and type
        """
        self.parameters.append(param)

    def addParamVaues(self, paramName, paramData):
        """
        add param data, replacing any data already stored under the same name
        """
        self.paramData[paramName] = paramData

    def setMaxIter(self, maxIter):
        """
        max iterations
        """
        self.maxIter = maxIter

    @abc.abstractmethod
    def prepare(self):
        pass

    @abc.abstractmethod
    def nextParamValues(self):
        pass

    @abc.abstractmethod
    def setCost(self, cost):
        pass

    def getBestSolution(self):
        """
        get best solution
        """
        return self.bestSolution
|
| 55 |
+
|
| 56 |
+
#enumerate through provided list of param values
|
| 57 |
+
class GuidedParameterSearch:
    """
    enumerates every combination of the provided parameter values
    """

    def __init__(self, verbose=False):
        """
        verbose : stored as self.verbose
        """
        self.verbose = verbose
        self.parameters = []
        self.paramData = {}
        self.paramIndexes = []
        self.numParamValues = []
        self.currentParams = []
        self.bestSolution = None

    def setMaxIter(self, maxIter):
        """
        max iterations
        """
        self.maxIter = maxIter

    def addParam(self, param):
        """
        add param name and type; the name is expected at index 0
        """
        self.parameters.append(param)

    def addParamVaues(self, paramName, paramData):
        """
        add param data
        """
        self.paramData[paramName] = paramData

    def prepare(self):
        """
        builds the list of all value index combinations and rewinds the enumeration

        Combinations are ordered with the first parameter varying slowest. With no
        parameters there is exactly one, empty, combination.
        """
        from itertools import product

        self.numParams = len(self.parameters)
        # rebuilt from scratch so that calling prepare() again does not append to
        # the lists left over from an earlier call
        self.paramIndexes = [0] * self.numParams
        #number of values for each parameter
        self.numParamValues = [len(self.paramData[param[0]]) for param in self.parameters]
        self.curParamIndex = 0

        indexRanges = [range(num) for num in self.numParamValues]
        self.paramValueCombList = [list(comb) for comb in product(*indexRanges)]
        self.numParamValueComb = len(self.paramValueCombList)
        self.curParamValueCombIndx = 0

    def nextParamValues(self):
        """
        next param combination as a list of (name, value), or None when all were returned
        """
        retParamNameValue = None
        if self.curParamValueCombIndx < len(self.paramValueCombList):
            curParams = self.paramValueCombList[self.curParamValueCombIndx]
            print (curParams)
            retParamNameValue = []
            for param, valueIndex in zip(self.parameters, curParams):
                paramName = param[0]
                retParamNameValue.append((paramName, self.paramData[paramName][valueIndex]))
            self.curParamValueCombIndx = self.curParamValueCombIndx + 1
            self.currentParams = retParamNameValue
        return retParamNameValue

    def setCost(self, cost):
        """
        set cost of current parameter set; the lowest cost seen is kept as best solution
        """
        if self.bestSolution is None or cost < self.bestSolution[1]:
            self.bestSolution = (self.currentParams, cost)

    def getBestSolution(self):
        """
        get best solution as (parameter list, cost), None before any cost was set
        """
        return self.bestSolution
|
| 134 |
+
|
| 135 |
+
#random search through provided list of parameter values
|
| 136 |
+
class RandomParameterSearch(BaseParameterSearch):
    """
    random search through provided list of parameter values
    """

    def __init__(self, verbose=False):
        super(RandomParameterSearch, self).__init__(verbose)

    def prepare(self):
        """
        nothing to prepare for random search
        """
        pass

    def nextParamValues(self):
        """
        next param combination: one randomly selected value per parameter, as a list of
        (name, value); None once maxIter combinations have been returned
        """
        retParamNameValue = None
        if self.curIter < self.maxIter:
            # items() is available on Python 2 and 3, iteritems() only on Python 2
            retParamNameValue = [(pName, selectRandomFromList(pValues))
                for pName, pValues in self.paramData.items()]
            self.curIter = self.curIter + 1
            self.currentParams = retParamNameValue
        return retParamNameValue

    def setCost(self, cost):
        """
        set cost of current parameter set; the lowest cost seen is kept as best solution
        """
        if self.bestSolution is None or cost < self.bestSolution[1]:
            self.bestSolution = (self.currentParams, cost)
|
| 164 |
+
|
| 165 |
+
#simulated annealing search through provided list of parameter values
|
| 166 |
+
class SimulatedAnnealingParameterSearch(BaseParameterSearch):
    """
    simulated annealing search through provided list of parameter values

    setMaxIter, setTemp and setTempReductionRate have to be called before searching.
    """

    def __init__(self, verbose=False):
        self.curSolution = None
        self.nextSolution = None
        super(SimulatedAnnealingParameterSearch, self).__init__(verbose)

    def prepare(self):
        """
        nothing to prepare
        """
        pass

    def setTemp(self, temp):
        """
        set the current temperature
        """
        self.temp = temp

    def setTempReductionRate(self, tempRedRate):
        """
        set the factor the temperature is multiplied with after a candidate was evaluated
        """
        self.tempRedRate = tempRedRate

    def nextParamValues(self):
        """
        next param combination as a list of (name, value): a random one on the first call,
        afterwards a copy of the accepted solution with one parameter value changed;
        None once maxIter combinations have been returned
        """
        retParamNameValue = None
        if self.curIter == 0:
            #initial random solution
            # items() is available on Python 2 and 3, iteritems() only on Python 2
            retParamNameValue = [(pName, selectRandomFromList(pValues))
                for pName, pValues in self.paramData.items()]
            self.curIter = self.curIter + 1
            self.currentParams = retParamNameValue
        elif self.curIter < self.maxIter:
            #perturb the accepted solution; a rejected candidate must not become the
            #starting point of the next move. Fall back to the last proposal when no
            #cost has been reported yet
            base = self.curSolution[0] if self.curSolution is not None else self.currentParams

            #only parameters with an alternative value can be mutated, otherwise the
            #search for a different value would never end
            mutable = [(pName, pValue) for (pName, pValue) in base
                if any(alt != pValue for alt in self.paramData[pName])]

            if mutable:
                #randomly mutate one parameter value
                (pNameSel, pValue) = selectRandomFromList(mutable)
                pValueNext = selectRandomFromList(self.paramData[pNameSel])
                while pValueNext == pValue:
                    pValueNext = selectRandomFromList(self.paramData[pNameSel])

                #copy with the selected parameter replaced
                retParamNameValue = [(pName, pValueNext if pName == pNameSel else pVal)
                    for (pName, pVal) in base]
            else:
                retParamNameValue = list(base)

            self.curIter = self.curIter + 1
            self.currentParams = retParamNameValue
        return retParamNameValue

    def setCost(self, cost):
        """
        set cost of current parameter set

        A cheaper candidate is always accepted. A more expensive one is accepted with
        probability exp((current cost - candidate cost) / temperature).
        """
        if self.curSolution is None:
            self.curSolution = (self.currentParams, cost)
            self.bestSolution = (self.currentParams, cost)
        else:
            self.nextSolution = (self.currentParams, cost)
            if self.nextSolution[1] < self.curSolution[1]:
                if self.verbose:
                    print ("next soln better")
                self.curSolution = self.nextSolution
                if self.nextSolution[1] < self.bestSolution[1]:
                    if self.verbose:
                        print ("next soln better than best")
                    self.bestSolution = self.nextSolution
            else:
                if self.verbose:
                    print ("next soln worst")
                pr = math.exp((self.curSolution[1] - self.nextSolution[1]) / self.temp)
                if pr > random.random():
                    self.curSolution = self.nextSolution
                    if self.verbose:
                        print ("next soln worst but accepted")
                else:
                    if self.verbose:
                        print ("next soln worst and rejected")

            # NOTE(review): cooling is applied once per evaluated candidate after the first
            # one; the original nesting of this statement could not be recovered - confirm
            self.temp = self.temp * self.tempRedRate
|
| 242 |
+
|
| 243 |
+
|
supv/regress.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import matplotlib
|
| 25 |
+
import random
|
| 26 |
+
import jprops
|
| 27 |
+
from io import StringIO
|
| 28 |
+
from sklearn.model_selection import cross_val_score
|
| 29 |
+
import joblib
|
| 30 |
+
from random import randint
|
| 31 |
+
from io import StringIO
|
| 32 |
+
from sklearn.linear_model import LinearRegression
|
| 33 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 34 |
+
from util import *
|
| 35 |
+
from mlutil import *
|
| 36 |
+
from pasearch import *
|
| 37 |
+
|
| 38 |
+
class BaseRegressor(object):
|
| 39 |
+
"""
|
| 40 |
+
base regression class
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, configFile, defValues):
    """
    initializer

    configFile : configuration file path
    defValues : dictionary of default values; the common and training data entries
                below are added to it before the configuration is created
    """
    defValues["common.mode"] = ("train", None)
    defValues["common.model.directory"] = ("model", None)
    defValues["common.model.file"] = (None, None)
    defValues["common.scale.file.path"] = (None, "missing scale file path")
    defValues["common.preprocessing"] = (None, None)
    defValues["common.verbose"] = (False, None)
    defValues["train.data.file"] = (None, "missing training data file")
    defValues["train.data.fields"] = (None, "missing training data field ordinals")
    defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
    defValues["train.data.out.field"] = (None, "missing out field ordinal")

    self.config = Configuration(configFile, defValues)
    self.featData = None
    self.outData = None
    self.regressor = None
    self.verbose = self.config.getBooleanConfig("common.verbose")[0]
    # mode is a name such as "train" (see the default above), so it is read as a
    # string and not as a boolean
    self.mode = self.config.getStringConfig("common.mode")[0]
    logFilePath = self.config.getStringConfig("common.logging.file")[0]
    logLevName = self.config.getStringConfig("common.logging.level")[0]
    self.logger = createLogger(__name__, logFilePath, logLevName)
    self.logger.info("********* starting session")
|
| 68 |
+
|
| 69 |
+
def initConfig(self, configFile, defValues):
    """
    initialize config: replaces self.config with a Configuration created from the
    given file and default values
    """
    config = Configuration(configFile, defValues)
    self.config = config
|
| 74 |
+
|
| 75 |
+
def getConfig(self):
    """
    get config object held in self.config
    """
    config = self.config
    return config
|
| 80 |
+
|
| 81 |
+
def setConfigParam(self, name, value):
    """
    set config param by forwarding name and value to setParam of the config object
    """
    config = self.config
    config.setParam(name, value)
|
| 86 |
+
|
| 87 |
+
def getMode(self):
    """
    get mode held in self.mode
    """
    mode = self.mode
    return mode
|
| 92 |
+
|
| 93 |
+
def train(self):
    """
    train model

    Builds the model, fits it on the training data and optionally saves it.
    Training data is loaded with prepData("train") on first use and cached.

    returns : (score from the regressor's score() on the training data, intercept, coefficients)
    """
    #build model
    self.buildModel()

    # training data, cached on the object after the first load
    if self.featData is not None:
        featData, outData = self.featData, self.outData
    else:
        featData, outData = self.prepData("train")
        self.featData, self.outData = featData, outData

    # parameters
    modelSave = self.config.getBooleanConfig("train.model.save")[0]

    #train
    self.logger.info("...training model")
    self.regressor.fit(featData, outData)
    rsqScore = self.regressor.score(featData, outData)
    coef = self.regressor.coef_
    intc = self.regressor.intercept_

    if modelSave:
        self.logger.info("...saving model")
        joblib.dump(self.regressor, self.getModelFilePath())
    return (rsqScore, intc, coef)
|
| 123 |
+
|
| 124 |
+
def validate(self):
    """
    predict on the validation data and score the model against it

    Returns a tuple (predicted output, score reported by the regressor
    for the validation features and actual output).
    """
    # make sure a model is available
    self.prepModel()

    # validation data
    featData, outDataActual = self.prepData("validate")

    # predict
    self.logger.info("...predicting")
    model = self.regressor
    outDataPred = model.predict(featData)

    # error
    rsqScore = model.score(featData, outDataActual)
    return (outDataPred, rsqScore)
|
| 139 |
+
|
| 140 |
+
def predict(self):
    """
    predict output for the prediction data set using the prepared model
    """
    # make sure a model is available
    self.prepModel()

    # only the feature part of the prepared data is needed
    preparedData = self.prepData("predict")
    featData = preparedData[0]

    # predict
    self.logger.info("...predicting")
    return self.regressor.predict(featData)
|
| 154 |
+
|
| 155 |
+
def prepData(self, mode):
    """
    load the data file configured for the given mode and split it into
    feature data and output data

    mode is used as the prefix of the configuration keys that are read
    (<mode>.data.file, <mode>.data.fields, <mode>.data.feature.fields and,
    unless mode is "predict", <mode>.data.out.field). Returns a tuple
    (feature data, output data); output data is None in "predict" mode.
    """
    cfg = self.config
    withOutput = mode != "predict"

    # parameters
    dataFile = cfg.getStringConfig(mode + ".data.file")[0]

    fieldIndices = cfg.getStringConfig(mode + ".data.fields")[0]
    if fieldIndices is not None:
        fieldIndices = strToIntArray(fieldIndices, ",")

    featFieldIndices = cfg.getStringConfig(mode + ".data.feature.fields")[0]
    if featFieldIndices is not None:
        featFieldIndices = strToIntArray(featFieldIndices, ",")

    if withOutput:
        outFieldIndex = cfg.getIntConfig(mode + ".data.out.field")[0]

    # load data
    data, featData = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
    preprocessing = cfg.getStringConfig("common.preprocessing")[0]
    if preprocessing == "scale":
        featData = sk.preprocessing.scale(featData)

    # the output column only exists for training and validation data
    outData = extrColumns(data, outFieldIndex) if withOutput else None
    return (featData, outData)
|
| 186 |
+
|
| 187 |
+
def prepModel(self):
    """
    make a model available, either by loading the saved one or by training

    When "predict.use.saved.model" is enabled the model is loaded from the
    model file once and then reused; a model that is already held by the
    instance is kept as is instead of being retrained. When the flag is
    disabled the model is trained.
    """
    useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
    if not useSavedModel:
        # train model
        self.train()
        return

    # compare against None explicitly so an already loaded model is never
    # mistaken for a missing one and silently replaced by a retrained one
    if self.regressor is None:
        # load saved model
        self.logger.info("...loading saved model")
        modelFilePath = self.getModelFilePath()
        self.regressor = joblib.load(modelFilePath)
|
| 200 |
+
|
| 201 |
+
class LinearRegressor(BaseRegressor):
    """
    linear regression
    """
    def __init__(self, configFile):
        """
        set up the defaults specific to linear regression and initialize
        the base regressor with them
        """
        defValues = {}
        defValues["train.normalize"] = (False, None)

        super(LinearRegressor, self).__init__(configFile, defValues)

    def buildModel(self):
        """
        builds model object
        """
        self.logger.info("...building linear regression model")
        normalize = self.config.getBooleanConfig("train.normalize")[0]

        # forward normalize only when it is enabled: omitting it is the same
        # as passing the default False, and scikit-learn releases that dropped
        # the keyword reject it even when it is False
        modelArgs = {"normalize": normalize} if normalize else {}
        self.regressor = LinearRegression(**modelArgs)
|
| 218 |
+
|
| 219 |
+
class ElasticNetRegressor(BaseRegressor):
    """
    elastic net regression
    """
    def __init__(self, configFile):
        """
        set up the defaults specific to elastic net regression and
        initialize the base regressor with them
        """
        defValues = {}
        defValues["train.alpha"] = (1.0, None)
        defValues["train.loneratio"] = (0.5, None)
        defValues["train.normalize"] = (False, None)
        defValues["train.precompute"] = (False, None)
        defValues["train.max.iter"] = (1000, None)
        defValues["train.tol"] = (0.0001, None)
        defValues["train.random.state"] = (None, None)
        defValues["train.selection"] = ("cyclic", None)

        super(ElasticNetRegressor, self).__init__(configFile, defValues)

    def buildModel(self):
        """
        builds model object
        """
        self.logger.info("...building elastic net regression model")
        alpha = self.config.getFloatConfig("train.alpha")[0]
        loneratio = self.config.getFloatConfig("train.loneratio")[0]
        normalize = self.config.getBooleanConfig("train.normalize")[0]
        precompute = self.config.getBooleanConfig("train.precompute")[0]
        maxIter = self.config.getIntConfig("train.max.iter")[0]
        tol = self.config.getFloatConfig("train.tol")[0]
        randState = self.config.getIntConfig("train.random.state")[0]
        # selection is a name such as "cyclic", so it is read as a string
        selection = self.config.getStringConfig("train.selection")[0]

        modelArgs = dict(alpha=alpha, l1_ratio=loneratio, precompute=precompute,
            max_iter=maxIter, tol=tol, random_state=randState, selection=selection)
        # forward normalize only when it is enabled: omitting it is the same
        # as passing the default False, and scikit-learn releases that dropped
        # the keyword reject it even when it is False
        if normalize:
            modelArgs["normalize"] = normalize

        self.regressor = ElasticNet(**modelArgs)
|
| 252 |
+
|
| 253 |
+
|
supv/rf.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import matplotlib
|
| 25 |
+
import random
|
| 26 |
+
import jprops
|
| 27 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 28 |
+
from random import randint
|
| 29 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 30 |
+
from util import *
|
| 31 |
+
from mlutil import *
|
| 32 |
+
from pasearch import *
|
| 33 |
+
from bacl import *
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# random forest classification
|
| 37 |
+
class RandomForest(BaseClassifier):
    """
    random forest classification
    """
    def __init__(self, configFile):
        """
        set up the default configuration values for the random forest
        classifier and initialize the base classifier with them
        """
        defValues = {}
        defValues["common.mode"] = ("training", None)
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.preprocessing"] = (None, None)
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing training data field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.validation"] = ("kfold", None)
        defValues["train.num.folds"] = (5, None)
        defValues["train.num.trees"] = (100, None)
        defValues["train.split.criterion"] = ("gini", None)
        defValues["train.max.depth"] = (None, None)
        defValues["train.min.samples.split"] = (4, None)
        defValues["train.min.samples.leaf"] = (2, None)
        defValues["train.min.weight.fraction.leaf"] = (0, None)
        defValues["train.max.features"] = ("auto", None)
        defValues["train.max.leaf.nodes"] = (None, None)
        defValues["train.min.impurity.decrease"] = (0, None)
        defValues["train.min.impurity.split"] = (1.0e-07, None)
        defValues["train.bootstrap"] = (True, None)
        defValues["train.oob.score"] = (False, None)
        defValues["train.num.jobs"] = (1, None)
        defValues["train.random.state"] = (None, None)
        defValues["train.verbose"] = (0, None)
        defValues["train.warm.start"] = (False, None)
        defValues["train.success.criterion"] = ("error", None)
        defValues["train.model.save"] = (False, None)
        defValues["train.score.method"] = ("accuracy", None)
        defValues["train.search.param.strategy"] = (None, None)
        defValues["train.search.params"] = (None, None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.data.fields"] = (None, "missing data field ordinals")
        defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
        defValues["predict.use.saved.model"] = (False, None)
        defValues["validate.data.file"] = (None, "missing validation data file")
        defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
        defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
        defValues["validate.data.class.field"] = (None, "missing class field ordinal")
        defValues["validate.use.saved.model"] = (False, None)
        defValues["validate.score.method"] = ("accuracy", None)

        super(RandomForest, self).__init__(configFile, defValues, __name__)

    def buildModel(self):
        """
        builds the random forest model object from the train.* configuration,
        stores it as the classifier of this instance and returns it
        """
        self.logger.info("...building random forest model")
        numTrees = self.config.getIntConfig("train.num.trees")[0]
        splitCriterion = self.config.getStringConfig("train.split.criterion")[0]
        # the next few values are read as strings and converted with typedValue
        maxDepth = typedValue(self.config.getStringConfig("train.max.depth")[0])
        minSamplesSplit = typedValue(self.config.getStringConfig("train.min.samples.split")[0])
        minSamplesLeaf = typedValue(self.config.getStringConfig("train.min.samples.leaf")[0])
        minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf")[0]
        maxFeatures = typedValue(self.config.getStringConfig("train.max.features")[0])
        maxLeafNodes = self.config.getIntConfig("train.max.leaf.nodes")[0]
        # only the decrease threshold is used; the "train.min.impurity.split"
        # value must not overwrite it
        minImpurityDecrease = self.config.getFloatConfig("train.min.impurity.decrease")[0]
        bootstrap = self.config.getBooleanConfig("train.bootstrap")[0]
        oobScore = self.config.getBooleanConfig("train.oob.score")[0]
        numJobs = self.config.getIntConfig("train.num.jobs")[0]
        randomState = self.config.getIntConfig("train.random.state")[0]
        verbose = self.config.getIntConfig("train.verbose")[0]
        warmStart = self.config.getBooleanConfig("train.warm.start")[0]

        # min_impurity_split is not passed: it was only ever passed as None
        # and scikit-learn releases that dropped the keyword reject it
        model = RandomForestClassifier(n_estimators=numTrees, criterion=splitCriterion, max_depth=maxDepth,
            min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf,
            min_weight_fraction_leaf=minWeightFractionLeaf, max_features=maxFeatures,
            max_leaf_nodes=maxLeafNodes, min_impurity_decrease=minImpurityDecrease,
            bootstrap=bootstrap, oob_score=oobScore, n_jobs=numJobs, random_state=randomState,
            verbose=verbose, warm_start=warmStart, class_weight=None)
        self.classifier = model
        return self.classifier

    def predictProb(self, recs):
        """
        predict class probability with in memory data

        recs is either a string, which is converted with
        prepStringPredictData, or an array that is used directly; a one
        dimensional result is reshaped to a single row before prediction.
        """
        # create model
        self.prepModel()

        # input record
        if isinstance(recs, str):
            featData = self.prepStringPredictData(recs)
        else:
            featData = recs
        if featData.ndim == 1:
            featData = featData.reshape(1, -1)

        # predict
        self.logger.info("...predicting class probability")
        clsData = self.classifier.predict_proba(featData)
        return clsData
|
| 134 |
+
|
supv/svm.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import sklearn.linear_model
|
| 25 |
+
import matplotlib
|
| 26 |
+
import random
|
| 27 |
+
import jprops
|
| 28 |
+
from random import randint
|
| 29 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 30 |
+
from util import *
|
| 31 |
+
from mlutil import *
|
| 32 |
+
from pasearch import *
|
| 33 |
+
from bacl import *
|
| 34 |
+
|
| 35 |
+
# support vector machine classification
|
| 36 |
+
class SupportVectorMachine(BaseClassifier):
    """
    support vector machine classification
    """
    def __init__(self, configFile):
        """
        set up the default configuration values for the svm classifier and
        initialize the base classifier with them
        """
        defValues = {}
        defValues["common.mode"] = ("train", None)
        defValues["common.model.directory"] = ("model", None)
        defValues["common.model.file"] = (None, None)
        defValues["common.scale.file.path"] = (None, "missing scale file path")
        defValues["common.preprocessing"] = (None, None)
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing training data field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.validation"] = ("kfold", None)
        defValues["train.num.folds"] = (5, None)
        defValues["train.algorithm"] = ("svc", None)
        defValues["train.kernel.function"] = ("rbf", None)
        defValues["train.poly.degree"] = (3, None)
        defValues["train.penalty"] = (1.0, None)
        defValues["train.gamma"] = ("scale", None)
        defValues["train.penalty.norm"] = ("l2", None)
        defValues["train.loss"] = ("squared_hinge", None)
        defValues["train.dual"] = (True, None)
        defValues["train.shrinking"] = (True, None)
        defValues["train.nu"] = (0.5, None)
        defValues["train.predict.probability"] = (False, None)
        defValues["train.print.sup.vectors"] = (False, None)
        defValues["train.success.criterion"] = ("error", None)
        defValues["train.model.save"] = (False, None)
        defValues["train.score.method"] = ("accuracy", None)
        defValues["train.search.param.strategy"] = (None, None)
        defValues["train.search.params"] = (None, None)
        defValues["predict.data.file"] = (None, None)
        defValues["predict.data.fields"] = (None, "missing data field ordinals")
        defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
        defValues["predict.use.saved.model"] = (False, None)
        defValues["validate.data.file"] = (None, "missing validation data file")
        defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
        defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
        defValues["validate.data.class.field"] = (None, "missing class field ordinal")
        defValues["validate.use.saved.model"] = (False, None)
        defValues["validate.score.method"] = ("accuracy", None)

        super(SupportVectorMachine, self).__init__(configFile, defValues, __name__)

    def buildModel(self):
        """
        builds the svm model object selected by "train.algorithm" (svc,
        nusvc or linearsvc), stores it as the classifier of this instance
        and returns it; any other algorithm name is logged and the process
        exits
        """
        self.logger.info("...building svm model")
        algo = self.config.getStringConfig("train.algorithm")[0]
        kernelFun = self.config.getStringConfig("train.kernel.function")[0]
        penalty = self.config.getFloatConfig("train.penalty")[0]
        polyDegree = self.config.getIntConfig("train.poly.degree")[0]
        # gamma is read as a string and converted with typedValue
        kernelCoeff = typedValue(self.config.getStringConfig("train.gamma")[0])
        penaltyNorm = self.config.getStringConfig("train.penalty.norm")[0]
        trainLoss = self.config.getStringConfig("train.loss")[0]
        dualOpt = self.config.getBooleanConfig("train.dual")[0]
        shrinkHeuristic = self.config.getBooleanConfig("train.shrinking")[0]
        predictProb = self.config.getBooleanConfig("train.predict.probability")[0]
        supVecBound = self.config.getFloatConfig("train.nu")[0]

        # degree is passed for the poly kernel only, gamma for the poly, rbf
        # and sigmoid kernels; other kernels get neither
        kernelArgs = {}
        if kernelFun == "poly":
            kernelArgs["degree"] = polyDegree
        if kernelFun in ("poly", "rbf", "sigmoid"):
            kernelArgs["gamma"] = kernelCoeff

        if algo == "svc":
            model = sk.svm.SVC(C=penalty, kernel=kernelFun, shrinking=shrinkHeuristic,
                probability=predictProb, **kernelArgs)
        elif algo == "nusvc":
            model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, shrinking=shrinkHeuristic,
                probability=predictProb, **kernelArgs)
        elif algo == "linearsvc":
            # the configured penalty applies to the linear variant as well
            model = sk.svm.LinearSVC(penalty=penaltyNorm, loss=trainLoss, dual=dualOpt, C=penalty)
        else:
            self.logger.info("invalid svm algorithm")
            sys.exit()
        self.classifier = model
        return self.classifier

    def predictProb(self, recs):
        """
        predict class probability with in memory data

        recs is either a string, which is converted with
        prepStringPredictData, or an array that is used directly; a one
        dimensional result is reshaped to a single row before prediction.
        """
        # create model
        self.prepModel()

        # input record
        if isinstance(recs, str):
            featData = self.prepStringPredictData(recs)
        else:
            featData = recs
        if featData.ndim == 1:
            featData = featData.reshape(1, -1)

        # predict
        self.logger.info("...predicting class probability")
        clsData = self.classifier.predict_proba(featData)
        return clsData
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
supv/svml.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/Users/pranab/Tools/anaconda/bin/python
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import sklearn as sk
|
| 24 |
+
import sklearn.linear_model
|
| 25 |
+
import matplotlib
|
| 26 |
+
import random
|
| 27 |
+
import jprops
|
| 28 |
+
from sklearn.externals import joblib
|
| 29 |
+
from sklearn.ensemble import BaggingClassifier
|
| 30 |
+
from random import randint
|
| 31 |
+
|
| 32 |
+
# the configuration properties file is a mandatory command line argument
if len(sys.argv) < 2:
    # name this script in the usage text and exit with a non zero status so
    # that a calling shell can tell the run failed
    print("usage: ./svml.py <config_properties_file>")
    sys.exit(1)
|
| 35 |
+
|
| 36 |
+
#train by bagging
|
| 37 |
+
def train_bagging():
    """
    Train a bagging ensemble on the full data set and print its error.

    The ensemble wraps the estimator returned by build_model(). When model
    persistence is enabled each fitted member estimator is dumped to its
    own numbered file (numbering starts at 1).
    """
    base = build_model()
    ensemble = BaggingClassifier(base_estimator=base, n_estimators=bagging_num_estimator,
        max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

    # train model
    ensemble.fit(XC, yc)

    # persist model
    if persist_model:
        for seq, member in enumerate(ensemble.estimators_, 1):
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(seq) + ".mod"
            joblib.dump(member, model_file)

    score = ensemble.score(XC, yc)
    print("average error %.3f" % (1.0 - score))
|
| 54 |
+
|
| 55 |
+
#linear k fold validation
|
| 56 |
+
def train_kfold_validation(nfold):
    """
    Run linear k fold validation with nfold folds.

    Uses the cross validation support of the library when native validation
    is enabled and prints the average error; otherwise hands over to
    train_kfold_validation_ext.
    """
    if not native_kfold_validation:
        print("extended linear kfold validation")
        train_kfold_validation_ext(nfold)
        return

    print("native linear kfold validation")
    estimator = build_model()
    fold_scores = sk.cross_validation.cross_val_score(estimator, XC, yc, cv=nfold)
    mean_score = np.mean(fold_scores)
    print("average error %.3f" % (1.0 - mean_score))
|
| 66 |
+
|
| 67 |
+
#linear k fold validation
|
| 68 |
+
def train_kfold_validation_ext(nfold):
    """
    Run linear k fold validation by hand over nfold consecutive slices.

    For every fold the data is split at the current offset, the model is
    fitted on the training part (and dumped to a numbered file when
    persistence is enabled), predictions are made for the held out part and
    the error figures returned by validate() are collected. The averages of
    the three error figures are printed at the end.
    """
    model = build_model()

    fold_len = dsize / nfold
    start = 0
    all_er, all_fp_er, all_fn_er = [], [], []
    for fold in range(0, nfold):
        print("....Next fold %d" % (fold))

        # split data
        XV, yv, X, y = split_data(start, fold_len)
        held_out_size = len(XV)

        # train model
        model.fit(X, y)

        # persist model
        if persist_model:
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(fold + 1) + ".mod"
            joblib.dump(model, model_file)

        # print support vectors
        print_support_vectors(model)

        # predict
        print("making predictions...")
        yp = model.predict(XV)

        # show prediction output
        er, fp_er, fn_er = validate(held_out_size, yv, yp)
        all_er.append(er)
        all_fp_er.append(fp_er)
        all_fn_er.append(fn_er)

        start += fold_len

    # average error
    av_error = np.mean(all_er)
    av_fp_error = np.mean(all_fp_er)
    av_fn_error = np.mean(all_fn_er)
    print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
|
| 113 |
+
|
| 114 |
+
# random k fold validation
|
| 115 |
+
def train_rfold_validation(nfold, niter):
    """
    Run random k fold validation for niter iterations.

    With native validation enabled every iteration makes a random split that
    holds out a 1/nfold fraction of the data, fits a freshly built model on
    the rest and scores it on the held out part; the scores and the average
    error are printed. Otherwise hands over to train_rfold_validation_ext.
    """
    if not native_rfold_validation:
        print("extended random kfold validation")
        train_rfold_validation_ext(nfold, niter)
        return

    print("native random kfold validation")
    holdout_fraction = 1.0 / nfold
    scores = []
    for _ in range(0, niter):
        seed = randint(1, 100)
        X, XV, y, yv = sk.cross_validation.train_test_split(XC, yc, test_size=holdout_fraction, random_state=seed)
        estimator = build_model()
        estimator.fit(X, y)
        scores.append(estimator.score(XV, yv))

    print(scores)
    mean_score = np.mean(scores)
    print("average error %.3f" % (1.0 - mean_score))
|
| 134 |
+
|
| 135 |
+
# random k fold validation
|
| 136 |
+
def train_rfold_validation_ext(nfold, niter):
    """
    Run random k fold validation by hand for niter iterations.

    Every iteration picks a random offset for the held out slice, builds and
    fits a new model on the remaining data (dumping it to a numbered file
    when persistence is enabled), predicts the held out slice and collects
    the error figures returned by validate(). The averages of the three
    error figures are printed at the end.
    """
    # keep the random offset far enough from the end of the data
    max_offset_frac = (1.0 - 1.0 / nfold) - .01
    slice_len = dsize / nfold

    all_er, all_fp_er, all_fn_er = [], [], []
    for it in range(0, niter):
        print("...Next iteration %d" % (it))
        offset = int(dsize * random.random() * max_offset_frac)
        print("offset: %d length: %d" % (offset, slice_len))
        XV, yv, X, y = split_data(offset, slice_len)
        held_out_size = len(XV)

        # build model
        model = build_model()

        # train model
        model.fit(X, y)

        # persist model
        if persist_model:
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(it + 1) + ".mod"
            print("saving model file " + model_file)
            joblib.dump(model, model_file)

        # print support vectors
        print_support_vectors(model)

        # predict
        print("making predictions...")
        yp = model.predict(XV)

        # show prediction output
        er, fp_er, fn_er = validate(held_out_size, yv, yp)
        all_er.append(er)
        all_fp_er.append(fp_er)
        all_fn_er.append(fn_er)

    av_error = np.mean(all_er)
    av_fp_error = np.mean(all_fp_er)
    av_fn_error = np.mean(all_fn_er)
    print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))
|
| 180 |
+
|
| 181 |
+
# make predictions
|
| 182 |
+
def predict():
    """
    Predict by majority vote over all saved models and print the result.

    Every numbered model file is loaded and used to predict the records in
    X. The first model creates one vote counter per record (a prediction of
    0 votes for class 0, anything else for class 1); the following models
    add their prediction to the matching counter. Class 0 wins a record only
    with strictly more votes than class 1.
    """
    psize = len(X)
    class_counts = []

    # all models
    for idx in range(0, num_models):
        model_file = model_file_directory + "/" + model_file_prefix + "_" + str(idx + 1) + ".mod"
        print("loading model file " + model_file)
        model = joblib.load(model_file)

        yp = model.predict(X)
        if idx != 0:
            # increment class count
            for rec in range(0, psize):
                class_counts[rec][yp[rec]] += 1
            continue

        # initialize class counts
        for label in yp:
            votes = {0: 1, 1: 0} if label == 0 else {1: 1, 0: 0}
            class_counts.append(votes)

    # predict based on majority vote
    print("here are the predictions")
    for rec in range(0, psize):
        votes = class_counts[rec]
        winner = 0 if votes[0] > votes[1] else 1
        majority = votes[winner]

        print(X[rec])
        print("prediction %d majority count %d" % (winner, majority))
|
| 225 |
+
|
| 226 |
+
#builds model
|
| 227 |
+
def build_model():
    """
    Creates an untrained scikit-learn SVM classifier as selected by the module
    level settings algo and kernel_fun. The polynomial kernel gets degree and
    gamma, the rbf and sigmoid kernels get gamma only, any other kernel gets
    neither. Exits the process for an unknown algorithm.
    """
    print("building model...")

    if algo == "svc" or algo == "nusvc":
        # kernel dependent keyword arguments shared by SVC and NuSVC
        kernel_args = {"kernel": kernel_fun}
        if kernel_fun == "poly":
            kernel_args["degree"] = poly_degree
            kernel_args["gamma"] = kernel_coeff
        elif kernel_fun in ("rbf", "sigmoid"):
            kernel_args["gamma"] = kernel_coeff

        if algo == "svc":
            # only SVC takes the penalty parameter
            return sk.svm.SVC(C=penalty, **kernel_args)
        return sk.svm.NuSVC(**kernel_args)

    if algo == "linearsvc":
        return sk.svm.LinearSVC()

    print("invalid svm algorithm")
    sys.exit()
|
| 250 |
+
|
| 251 |
+
#splits data into training and validation sets
|
| 252 |
+
def split_data(offset, length):
    """
    Splits the module level data set (XC, yc) into a validation window and the
    remaining training data.

    offset : index of the first validation record
    length : number of validation records, clipped at the end of the data

    Returns (XV, yv, X, y): validation features, validation labels (list),
    training features and training labels (numpy array).
    """
    print("splitting data...")

    # work on copies, as the original did, so the returned validation slice
    # is never a view of the module level array
    features = np.copy(XC)
    labels = list(yc)

    # validation window, upper bound clipped to the data size
    upper = min(offset + length, len(yc))
    window = np.s_[offset:upper:1]
    XV = features[window]
    yv = labels[window]
    print("data size %d validation data size %d" % (dsize, len(XV)))

    # training set is everything outside the validation window
    X = np.delete(features, window, 0)
    y = np.delete(labels, window, 0)
    return (XV, yv, X, y)
|
| 278 |
+
|
| 279 |
+
#print support vectors
|
| 280 |
+
def print_support_vectors(model):
    """
    Prints the support vectors of a trained model and the number of support
    vectors. Does nothing for the linearsvc algorithm or when the module level
    flag print_sup_vectors is off.
    """
    if algo == "linearsvc" or not print_sup_vectors:
        return

    print("showing support vectors...")
    print(model.support_vectors_)
    print("num of support vectors")
    print(model.n_support_)
|
| 287 |
+
|
| 288 |
+
#prints prediction output
|
| 289 |
+
def validate(dvsize,yv,yp):
    """
    Compares actual and predicted binary labels and prints the error rate and
    the true/false positive/negative rates, each as a fraction of dvsize.

    dvsize : number of validation records to compare
    yv : actual labels
    yp : predicted labels

    Returns (error rate, false positive rate, false negative rate).
    """
    print("showing predictions...")
    mismatches = 0
    tally = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}

    for idx in range(dvsize):
        actual = yv[idx]
        predicted = yp[idx]
        if actual != predicted:
            mismatches += 1

        if predicted == 1 and actual == 1:
            outcome = "tp"
        elif predicted == 1 and actual == 0:
            outcome = "fp"
        elif predicted == 0 and actual == 0:
            outcome = "tn"
        else:
            # every remaining combination is counted as a false negative
            outcome = "fn"
        tally[outcome] += 1

    er = float(mismatches) / dvsize
    fp_er = float(tally["fp"]) / dvsize
    fn_er = float(tally["fn"]) / dvsize
    print("error %.3f" % er)
    print("true positive : %.3f" % (float(tally["tp"]) / dvsize))
    print("false positive: %.3f" % fp_er)
    print("true negative : %.3f" % (float(tally["tn"]) / dvsize))
    print("false negative: %.3f" % fn_er)

    return (er, fp_er, fn_er)
|
| 320 |
+
|
| 321 |
+
# load configuration
|
| 322 |
+
def getConfigs(configFile):
    """
    Reads a properties file with jprops, echoing every key and value as it is
    read, and returns all entries as a dictionary.

    configFile : path of the properties file
    """
    print("using following configurations")
    settings = {}
    with open(configFile) as propFile:
        for name, setting in jprops.iter_properties(propFile):
            print("%s %s" % (name, setting))
            settings[name] = setting
    return settings
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# load configuration
|
| 334 |
+
# Driver script: loads the configuration named on the command line and either
# trains models with the configured validation scheme or predicts with
# previously persisted models. All settings are module level names because the
# functions defined earlier in this file read them as globals.
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

if mode == "train":
    #train
    print("running in train mode")
    data_file = configs["train.data.file"]
    feat_field_indices = configs["train.data.feature.fields"].split(",")
    feat_field_indices = [int(a) for a in feat_field_indices]
    class_field_index = int(configs["train.data.class.field"])
    preprocess = configs["common.preprocessing"]
    validation = configs["train.validation"]
    num_folds = int(configs["train.num.folds"])
    num_iter = int(configs["train.num.iter"])
    algo = configs["train.algorithm"]
    kernel_fun = configs["train.kernel.function"]
    poly_degree = int(configs["train.poly.degree"])

    # a negative value in the config selects the library default
    penalty = float(configs["train.penalty"])
    if penalty < 0:
        penalty = 1.0
        print("using default for penalty")
    kernel_coeff = float(configs["train.gamma"])
    if kernel_coeff < 0:
        kernel_coeff = 'auto'
        print("using default for gamma")
    print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
    persist_model = configs["train.persist.model"].lower() == "true"
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]

    print(feat_field_indices)

    #extract feature fields
    d = np.loadtxt(data_file, delimiter=',')
    dsize = len(d)
    XC = d[:,feat_field_indices]

    #preprocess features
    if (preprocess == "scale"):
        XC = sk.preprocessing.scale(XC)
    elif (preprocess == "normalize"):
        XC = sk.preprocessing.normalize(XC, norm='l2')
    else:
        print("no preprocessing done")

    #extract output field
    yc = d[:,[class_field_index]]
    yc = yc.reshape(dsize)
    yc = [int(a) for a in yc]

    # train model
    if validation == "kfold":
        native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
        train_kfold_validation(num_folds)
    elif validation == "rfold":
        native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
        train_rfold_validation(num_folds,num_iter)
    elif validation == "bagging":
        bagging_num_estimator = int(configs["train.bagging.num.estimators"])
        bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
        # The flag used to be read from the sample fraction key, a number that
        # never equals "true". It now has its own key; when that key is absent
        # the flag stays False, which is what the old code always produced.
        # NOTE(review): key name train.bagging.use.oob is inferred - confirm
        # against the properties files in use.
        bagging_use_oob = configs.get("train.bagging.use.oob", "false").lower() == "true"
        train_bagging()
    else:
        print("invalid training validation method")
        sys.exit()

else:
    #predict
    print("running in prediction mode")
    pred_data_file = configs["pred.data.file"]
    pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
    pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
    preprocess = configs["common.preprocessing"]
    num_models = int(configs["pred.num.models"])
    model_file_directory = configs["common.model.directory"]
    model_file_prefix = configs["common.model.file.prefix"]

    #extract feature fields
    pd = np.loadtxt(pred_data_file, delimiter=',')
    pdsize = len(pd)
    X = pd[:,pred_feat_field_indices]

    #preprocess features
    if (preprocess == "scale"):
        X = sk.preprocessing.scale(X)
    elif (preprocess == "normalize"):
        X = sk.preprocessing.normalize(X, norm='l2')
    else:
        print("no preprocessing done")

    predict()
|
supv/tnn.py
ADDED
|
@@ -0,0 +1,789 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/local/bin/python3
|
| 2 |
+
|
| 3 |
+
# avenir-python: Machine Learning
|
| 4 |
+
# Author: Pranab Ghosh
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
| 7 |
+
# may not use this file except in compliance with the License. You may
|
| 8 |
+
# obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 15 |
+
# implied. See the License for the specific language governing
|
| 16 |
+
# permissions and limitations under the License.
|
| 17 |
+
|
| 18 |
+
# Package imports
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import numpy as np
|
| 23 |
+
import torch
|
| 24 |
+
from torch.autograd import Variable
|
| 25 |
+
from torch.utils.data import Dataset, TensorDataset
|
| 26 |
+
from torch.utils.data import DataLoader
|
| 27 |
+
import sklearn as sk
|
| 28 |
+
from sklearn.neighbors import KDTree
|
| 29 |
+
import matplotlib
|
| 30 |
+
import random
|
| 31 |
+
import jprops
|
| 32 |
+
from random import randint
|
| 33 |
+
import statistics
|
| 34 |
+
sys.path.append(os.path.abspath("../lib"))
|
| 35 |
+
from util import *
|
| 36 |
+
from mlutil import *
|
| 37 |
+
|
| 38 |
+
"""
|
| 39 |
+
forward hook function
|
| 40 |
+
"""
|
| 41 |
+
intermedOut = {}
|
| 42 |
+
lvalues = list()
|
| 43 |
+
|
| 44 |
+
def hookFn(m, i, o):
    """
    forward hook call back, appends the first row of the hooked module's output
    to the module level list lvalues

    Parameters
        m : module the hook is registered on (unused)
        i : module input (unused)
        o : module output tensor
    """
    firstRow = o.data.cpu().numpy()[0]
    lvalues.append(firstRow.tolist())
|
| 53 |
+
|
| 54 |
+
def getLatValues():
    """
    returns the module level list of latent values (the list object itself, not a copy)
    """
    return lvalues
|
| 58 |
+
|
| 59 |
+
class FeedForwardNetwork(torch.nn.Module):
|
| 60 |
+
def __init__(self, configFile, addDefValues=None):
    """
    Creates the configuration object from the config file and the default values.
    No network layers are created here, that happens in buildModel.

    Parameters
        configFile : config file path
        addDefValues : dictionary of additional default values
    """
    super(FeedForwardNetwork, self).__init__()

    # each value is a pair, apparently (default value, message used when a
    # mandatory parameter is missing) - confirm against Configuration
    defValues = dict() if addDefValues is None else addDefValues.copy()
    defValues.update({
        "common.mode" : ("training", None),
        "common.model.directory" : ("model", None),
        "common.model.file" : (None, None),
        "common.preprocessing" : (None, None),
        "common.scaling.method" : ("zscale", None),
        "common.scaling.minrows" : (50, None),
        "common.scaling.param.file" : (None, None),
        "common.verbose" : (False, None),
        "common.device" : ("cpu", None),
        "train.data.file" : (None, "missing training data file"),
        "train.data.fields" : (None, "missing training data field ordinals"),
        "train.data.feature.fields" : (None, "missing training data feature field ordinals"),
        "train.data.out.fields" : (None, "missing training data feature field ordinals"),
        "train.layer.data" : (None, "missing layer data"),
        "train.input.size" : (None, None),
        "train.output.size" : (None, "missing output size"),
        "train.batch.size" : (10, None),
        "train.loss.reduction" : ("mean", None),
        "train.num.iterations" : (500, None),
        "train.lossFn" : ("mse", None),
        "train.optimizer" : ("sgd", None),
        "train.opt.learning.rate" : (.0001, None),
        "train.opt.weight.decay" : (0, None),
        "train.opt.momentum" : (0, None),
        "train.opt.eps" : (1e-08, None),
        "train.opt.dampening" : (0, None),
        "train.opt.momentum.nesterov" : (False, None),
        "train.opt.betas" : ([0.9, 0.999], None),
        "train.opt.alpha" : (0.99, None),
        "train.save.model" : (False, None),
        "train.track.error" : (False, None),
        "train.epoch.intv" : (5, None),
        "train.batch.intv" : (5, None),
        "train.print.weights" : (False, None),
        "valid.data.file" : (None, None),
        "valid.accuracy.metric" : (None, None),
        "predict.data.file" : (None, None),
        "predict.use.saved.model" : (True, None),
        "predict.output" : ("binary", None),
        "predict.feat.pad.size" : (60, None),
        "predict.print.output" : (True, None),
        "calibrate.num.bins" : (10, None),
        "calibrate.pred.prob.thresh" : (0.5, None),
        "calibrate.num.nearest.neighbors" : (10, None),
    })
    self.config = Configuration(configFile, defValues)
|
| 117 |
+
|
| 118 |
+
def setConfigParam(self, name, value):
    """
    sets a config parameter by delegating to the configuration object

    Parameters
        name : config name
        value : config value
    """
    self.config.setParam(name, value)
|
| 127 |
+
|
| 128 |
+
def getConfig(self):
    """
    returns the configuration object held by this model
    """
    return self.config
|
| 133 |
+
|
| 134 |
+
def setVerbose(self, verbose):
    """
    sets the verbosity flag

    Parameters
        verbose : value stored as the verbose attribute
    """
    self.verbose = verbose
|
| 136 |
+
|
| 137 |
+
def buildModel(self):
    """
    Loads configuration and builds the various pieces necessary for the model:
    the layer stack, training data, optional validation data, loss function and
    optimizer. Data tensors and the model are moved to the configured device.

    Validation data is loaded only when valid.data.file is configured. Otherwise
    validFeatData and validOutData are set to None and can be supplied later
    through setValidationData.
    """
    torch.manual_seed(9999)

    self.verbose = self.config.getBooleanConfig("common.verbose")[0]
    numinp = self.config.getIntConfig("train.input.size")[0]
    if numinp is None:
        # input size defaults to the number of feature fields
        numinp = len(self.config.getIntListConfig("train.data.feature.fields")[0])
    self.outputSize = self.config.getIntConfig("train.output.size")[0]
    self.batchSize = self.config.getIntConfig("train.batch.size")[0]
    self.numIter = self.config.getIntConfig("train.num.iterations")[0]
    optimizer = self.config.getStringConfig("train.optimizer")[0]
    self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
    self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
    self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
    self.batchIntv = self.config.getIntConfig("train.batch.intv")[0]
    self.restored = False
    self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

    #build network, one comma separated entry per layer
    layers = list()
    ninp = numinp
    trData = self.config.getStringConfig("train.layer.data")[0].split(",")
    for ld in trData:
        lde = ld.split(":")
        assert len(lde) == 5, "expecting 5 items for layer data"

        #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
        nunit = int(lde[0])
        actStr = lde[1]
        act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
        bnorm = lde[2] == "true"
        afterAct = lde[3] == "true"
        dpr = float(lde[4])

        layers.append(torch.nn.Linear(ninp, nunit))
        if bnorm:
            #with batch norm, before or after the activation
            if afterAct:
                safeAppend(layers, act)
                layers.append(torch.nn.BatchNorm1d(nunit))
            else:
                layers.append(torch.nn.BatchNorm1d(nunit))
                safeAppend(layers, act)
        else:
            #without batch norm
            safeAppend(layers, act)

        if dpr > 0:
            layers.append(torch.nn.Dropout(dpr))
        ninp = nunit

    self.layers = torch.nn.Sequential(*layers)

    self.device = FeedForwardNetwork.getDevice(self)

    #training data
    dataFile = self.config.getStringConfig("train.data.file")[0]
    (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)
    self.featData = torch.from_numpy(featData)
    self.outData = torch.from_numpy(outData)

    #validation data, optional
    dataFile = self.config.getStringConfig("valid.data.file")[0]
    if dataFile is not None:
        (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)
        self.validFeatData = torch.from_numpy(featDataV)
        self.validOutData = torch.from_numpy(outDataV)
    else:
        self.validFeatData = None
        self.validOutData = None

    # loss function and optimizer
    self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
    self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)

    self.yPred = None

    #move to device, validation output stays where it is as before
    self.featData = self.featData.to(self.device)
    self.outData = self.outData.to(self.device)
    if self.validFeatData is not None:
        self.validFeatData = self.validFeatData.to(self.device)
    self.to(self.device)
|
| 223 |
+
|
| 224 |
+
@staticmethod
|
| 225 |
+
def getDevice(model):
|
| 226 |
+
"""
|
| 227 |
+
gets device
|
| 228 |
+
|
| 229 |
+
Parameters
|
| 230 |
+
model : torch model
|
| 231 |
+
"""
|
| 232 |
+
devType = model.config.getStringConfig("common.device")[0]
|
| 233 |
+
if devType == "cuda":
|
| 234 |
+
if torch.cuda.is_available():
|
| 235 |
+
device = torch.device("cuda")
|
| 236 |
+
else:
|
| 237 |
+
exitWithMsg("cuda not available")
|
| 238 |
+
else:
|
| 239 |
+
device = torch.device("cpu")
|
| 240 |
+
return device
|
| 241 |
+
|
| 242 |
+
def setValidationData(self, dataSource, prep=True):
|
| 243 |
+
"""
|
| 244 |
+
sets validation data
|
| 245 |
+
|
| 246 |
+
Parameters
|
| 247 |
+
dataSource : data source str if file path or 2D array
|
| 248 |
+
prep : if True load and prepare
|
| 249 |
+
"""
|
| 250 |
+
if prep:
|
| 251 |
+
(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)
|
| 252 |
+
self.validFeatData = torch.from_numpy(featDataV)
|
| 253 |
+
self.validOutData = outDataV
|
| 254 |
+
else:
|
| 255 |
+
self.validFeatData = torch.from_numpy(dataSource[0])
|
| 256 |
+
self.validOutData = dataSource[1]
|
| 257 |
+
|
| 258 |
+
self.validFeatData = self.validFeatData.to(self.device)
|
| 259 |
+
|
| 260 |
+
@staticmethod
|
| 261 |
+
def createActivation(actName):
|
| 262 |
+
"""
|
| 263 |
+
create activation
|
| 264 |
+
|
| 265 |
+
Parameters
|
| 266 |
+
actName : activation name
|
| 267 |
+
"""
|
| 268 |
+
if actName is None:
|
| 269 |
+
activation = None
|
| 270 |
+
elif actName == "relu":
|
| 271 |
+
activation = torch.nn.ReLU()
|
| 272 |
+
elif actName == "tanh":
|
| 273 |
+
activation = torch.nn.Tanh()
|
| 274 |
+
elif actName == "sigmoid":
|
| 275 |
+
activation = torch.nn.Sigmoid()
|
| 276 |
+
elif actName == "softmax":
|
| 277 |
+
activation = torch.nn.Softmax(dim=1)
|
| 278 |
+
else:
|
| 279 |
+
exitWithMsg("invalid activation function name " + actName)
|
| 280 |
+
return activation
|
| 281 |
+
|
| 282 |
+
@staticmethod
|
| 283 |
+
def createLossFunction(model, lossFnName):
|
| 284 |
+
"""
|
| 285 |
+
create loss function
|
| 286 |
+
|
| 287 |
+
Parameters
|
| 288 |
+
lossFnName : loss function name
|
| 289 |
+
"""
|
| 290 |
+
config = model.config
|
| 291 |
+
lossRed = config.getStringConfig("train.loss.reduction")[0]
|
| 292 |
+
if lossFnName == "ltwo" or lossFnName == "mse":
|
| 293 |
+
lossFunc = torch.nn.MSELoss(reduction=lossRed)
|
| 294 |
+
elif lossFnName == "ce":
|
| 295 |
+
lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)
|
| 296 |
+
elif lossFnName == "lone" or lossFnName == "mae":
|
| 297 |
+
lossFunc = torch.nn.L1Loss(reduction=lossRed)
|
| 298 |
+
elif lossFnName == "bce":
|
| 299 |
+
lossFunc = torch.nn.BCELoss(reduction=lossRed)
|
| 300 |
+
elif lossFnName == "bcel":
|
| 301 |
+
lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)
|
| 302 |
+
elif lossFnName == "sm":
|
| 303 |
+
lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)
|
| 304 |
+
elif lossFnName == "mlsm":
|
| 305 |
+
lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)
|
| 306 |
+
else:
|
| 307 |
+
exitWithMsg("invalid loss function name " + lossFnName)
|
| 308 |
+
return lossFunc
|
| 309 |
+
|
| 310 |
+
@staticmethod
|
| 311 |
+
def createOptimizer(model, optName):
|
| 312 |
+
"""
|
| 313 |
+
create optimizer
|
| 314 |
+
|
| 315 |
+
Parameters
|
| 316 |
+
optName : optimizer name
|
| 317 |
+
"""
|
| 318 |
+
config = model.config
|
| 319 |
+
learnRate = config.getFloatConfig("train.opt.learning.rate")[0]
|
| 320 |
+
weightDecay = config.getFloatConfig("train.opt.weight.decay")[0]
|
| 321 |
+
momentum = config.getFloatConfig("train.opt.momentum")[0]
|
| 322 |
+
eps = config.getFloatConfig("train.opt.eps")[0]
|
| 323 |
+
if optName == "sgd":
|
| 324 |
+
dampening = config.getFloatConfig("train.opt.dampening")[0]
|
| 325 |
+
momentumNesterov = config.getBooleanConfig("train.opt.momentum.nesterov")[0]
|
| 326 |
+
optimizer = torch.optim.SGD(model.parameters(),lr=learnRate, momentum=momentum,
|
| 327 |
+
dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)
|
| 328 |
+
elif optName == "adam":
|
| 329 |
+
betas = config.getFloatListConfig("train.opt.betas")[0]
|
| 330 |
+
betas = (betas[0], betas[1])
|
| 331 |
+
optimizer = torch.optim.Adam(model.parameters(), lr=learnRate,betas=betas, eps = eps,
|
| 332 |
+
weight_decay=weightDecay)
|
| 333 |
+
elif optName == "rmsprop":
|
| 334 |
+
alpha = config.getFloatConfig("train.opt.alpha")[0]
|
| 335 |
+
optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,
|
| 336 |
+
eps=eps, weight_decay=weightDecay, momentum=momentum)
|
| 337 |
+
else:
|
| 338 |
+
exitWithMsg("invalid optimizer name " + optName)
|
| 339 |
+
return optimizer
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def forward(self, x):
|
| 343 |
+
"""
|
| 344 |
+
In the forward function we accept a Tensor of input data and we must return
|
| 345 |
+
a Tensor of output data. We can use Modules defined in the constructor as
|
| 346 |
+
well as arbitrary (differentiable) operations on Tensors.
|
| 347 |
+
|
| 348 |
+
Parameters
|
| 349 |
+
x : data batch
|
| 350 |
+
"""
|
| 351 |
+
y = self.layers(x)
|
| 352 |
+
return y
|
| 353 |
+
|
| 354 |
+
@staticmethod
|
| 355 |
+
def addForwardHook(model, l, cl = 0):
|
| 356 |
+
"""
|
| 357 |
+
register forward hooks
|
| 358 |
+
|
| 359 |
+
Parameters
|
| 360 |
+
l :
|
| 361 |
+
cl :
|
| 362 |
+
"""
|
| 363 |
+
for name, layer in model._modules.items():
|
| 364 |
+
#If it is a sequential, don't register a hook on it
|
| 365 |
+
# but recursively register hook on all it's module children
|
| 366 |
+
print(str(cl) + " : " + name)
|
| 367 |
+
if isinstance(layer, torch.nn.Sequential):
|
| 368 |
+
FeedForwardNetwork.addForwardHook(layer, l, cl)
|
| 369 |
+
else:
|
| 370 |
+
# it's a non sequential. Register a hook
|
| 371 |
+
if cl == l:
|
| 372 |
+
print("setting hook at layer " + str(l))
|
| 373 |
+
layer.register_forward_hook(hookFn)
|
| 374 |
+
cl += 1
|
| 375 |
+
|
| 376 |
+
@staticmethod
def prepData(model, dataSource, includeOutFld=True):
    """
    loads and prepares data, optionally scaling the features, and converts
    the result to float32 numpy arrays

    Parameters
        model : torch model, provides the configuration
        dataSource : data source str if file path or 2D array
        includeOutFld : True if target field to be included

    Returns a pair of feature data and output data when includeOutFld is
    True, otherwise the feature data only
    """
    # parameters
    fieldIndices = model.config.getIntListConfig("train.data.fields")[0]
    featFieldIndices = model.config.getIntListConfig("train.data.feature.fields")[0]

    #all data and feature data
    isDataFile = isinstance(dataSource, str)
    # without the target the last configured field is dropped
    # NOTE(review): this assumes the target is the last entry of train.data.fields - confirm
    selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]
    if isDataFile:
        #source file path
        (data, featData) = loadDataFile(dataSource, ",", selFieldIndices, featFieldIndices)
    else:
        # tabular data
        data = tableSelFieldsFilter(dataSource, selFieldIndices)
        featData = tableSelFieldsFilter(data, featFieldIndices)
        #print(featData)
        featData = np.array(featData)

    if (model.config.getStringConfig("common.preprocessing")[0] == "scale"):
        scalingMethod = model.config.getStringConfig("common.scaling.method")[0]

        #scale only if there are enough rows
        nrow = featData.shape[0]
        minrows = model.config.getIntConfig("common.scaling.minrows")[0]
        if nrow > minrows:
            #scale with this data set, presumably using its own statistics - see scaleData
            featData = scaleData(featData, scalingMethod)
        else:
            #use pre computed scaling parameters restored from file
            spFile = model.config.getStringConfig("common.scaling.param.file")[0]
            if spFile is None:
                exitWithMsg("for small data sets pre computed scaling parameters need to provided")
            scParams = restoreObject(spFile)
            featData = scaleDataWithParams(featData, scalingMethod, scParams)
            featData = np.array(featData)

    # target data
    if includeOutFld:
        outFieldIndices = model.config.getStringConfig("train.data.out.fields")[0]
        outFieldIndices = strToIntArray(outFieldIndices, ",")
        if isDataFile:
            outData = data[:,outFieldIndices]
        else:
            outData = tableSelFieldsFilter(data, outFieldIndices)
            outData = np.array(outData)
        foData = (featData.astype(np.float32), outData.astype(np.float32))
    else:
        foData = featData.astype(np.float32)
    return foData
|
| 433 |
+
|
| 434 |
+
@staticmethod
|
| 435 |
+
def saveCheckpt(model):
|
| 436 |
+
"""
|
| 437 |
+
checkpoints model
|
| 438 |
+
|
| 439 |
+
Parameters
|
| 440 |
+
model : torch model
|
| 441 |
+
"""
|
| 442 |
+
print("..saving model checkpoint")
|
| 443 |
+
modelDirectory = model.config.getStringConfig("common.model.directory")[0]
|
| 444 |
+
assert os.path.exists(modelDirectory), "model save directory does not exist"
|
| 445 |
+
modelFile = model.config.getStringConfig("common.model.file")[0]
|
| 446 |
+
filepath = os.path.join(modelDirectory, modelFile)
|
| 447 |
+
state = {"state_dict": model.state_dict(), "optim_dict": model.optimizer.state_dict()}
|
| 448 |
+
torch.save(state, filepath)
|
| 449 |
+
if model.verbose:
|
| 450 |
+
print("model saved")
|
| 451 |
+
|
| 452 |
+
@staticmethod
|
| 453 |
+
def restoreCheckpt(model, loadOpt=False):
|
| 454 |
+
"""
|
| 455 |
+
restored checkpointed model
|
| 456 |
+
|
| 457 |
+
Parameters
|
| 458 |
+
model : torch model
|
| 459 |
+
loadOpt : True if optimizer to be loaded
|
| 460 |
+
"""
|
| 461 |
+
if not model.restored:
|
| 462 |
+
print("..restoring model checkpoint")
|
| 463 |
+
modelDirectory = model.config.getStringConfig("common.model.directory")[0]
|
| 464 |
+
modelFile = model.config.getStringConfig("common.model.file")[0]
|
| 465 |
+
filepath = os.path.join(modelDirectory, modelFile)
|
| 466 |
+
assert os.path.exists(filepath), "model save file does not exist"
|
| 467 |
+
checkpoint = torch.load(filepath)
|
| 468 |
+
model.load_state_dict(checkpoint["state_dict"])
|
| 469 |
+
model.to(model.device)
|
| 470 |
+
if loadOpt:
|
| 471 |
+
model.optimizer.load_state_dict(checkpoint["optim_dict"])
|
| 472 |
+
model.restored = True
|
| 473 |
+
|
| 474 |
+
@staticmethod
|
| 475 |
+
def processClassifOutput(yPred, config):
|
| 476 |
+
"""
|
| 477 |
+
extracts probability label 1 or label with highest probability
|
| 478 |
+
|
| 479 |
+
Parameters
|
| 480 |
+
yPred : predicted output
|
| 481 |
+
config : config object
|
| 482 |
+
"""
|
| 483 |
+
outType = config.getStringConfig("predict.output")[0]
|
| 484 |
+
if outType == "prob":
|
| 485 |
+
outputSize = config.getIntConfig("train.output.size")[0]
|
| 486 |
+
if outputSize == 2:
|
| 487 |
+
#return prob of pos class for binary classifier
|
| 488 |
+
yPred = yPred[:, 1]
|
| 489 |
+
else:
|
| 490 |
+
#return class value and probability for multi classifier
|
| 491 |
+
yCl = np.argmax(yPred, axis=1)
|
| 492 |
+
yPred = list(map(lambda y : y[0][y[1]], zip(yPred, yCl)))
|
| 493 |
+
yPred = zip(yCl, yPred)
|
| 494 |
+
else:
|
| 495 |
+
yPred = np.argmax(yPred, axis=1)
|
| 496 |
+
return yPred
|
| 497 |
+
|
| 498 |
+
@staticmethod
def printPrediction(yPred, config, dataSource):
    """
    prints input feature data and prediction, one line per record

    Parameters
        yPred : predicted output, indexed by record position
        config : config object
        dataSource : data source str if file path or 2D array
    """
    padWidth = config.getIntConfig("predict.feat.pad.size")[0]

    if isinstance(dataSource, str):
        # file path, records come as lists of string fields
        records = fileRecGen(dataSource, ",")
    else:
        # tabular data, each row converted to strings with toStrList(rec, 6)
        records = (toStrList(rec, 6) for rec in dataSource)

    for i, rec in enumerate(records):
        # features padded to fixed width so that predictions line up
        feat = ",".join(rec).ljust(padWidth, " ")
        print(feat + "\t" + str(yPred[i]))
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
@staticmethod
def allTrain(model):
    """
    train with all data in one batch per iteration, then score on the validation data

    Parameters
        model : torch model

    Returns
        performance score from perfMetric for model.accMetric
    """
    # train mode
    model.train()
    for t in range(model.numIter):
        # forward pass over the full training data
        yPred = model(model.featData)

        # compute and print loss
        loss = model.lossFn(yPred, model.outData)
        if model.verbose and t % 50 == 0:
            print("epoch {} loss {:.6f}".format(t, loss.item()))

        # zero gradients, perform a backward pass, and update the weights
        model.optimizer.zero_grad()
        loss.backward()
        model.optimizer.step()

    # validate, no gradient bookkeeping needed for the validation forward pass
    model.eval()
    with torch.no_grad():
        yPred = model(model.validFeatData)
    yPred = yPred.data.cpu().numpy()
    yActual = model.validOutData
    if model.verbose:
        result = np.concatenate((yPred, yActual), axis = 1)
        print("predicted actual")
        print(result)

    score = perfMetric(model.accMetric, yActual, yPred)
    print(formatFloat(3, score, "perf score"))
    return score
|
| 565 |
+
|
| 566 |
+
@staticmethod
def batchTrain(model):
    """
    train with batch data, then score on the validation data and optionally
    save the model, plot errors and print weights

    Parameters
        model : torch model

    Returns
        performance score from perfMetric for model.accMetric
    """
    model.restored = False
    trainData = TensorDataset(model.featData, model.outData)
    trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
    epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

    # train mode
    model.train()

    # error history, filled only when error tracking is on
    trErr = list()
    vaErr = list()

    # batchIntv of 0 selects error tracking at epoch level, positive value at batch level
    epochLevelTrack = model.trackErr and model.batchIntv == 0

    # epoch
    for t in range(model.numIter):
        epochLoss = 0.0

        # batch
        for b, (xBatch, yBatch) in enumerate(trainDataLoader):
            # forward pass
            xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)
            yPred = model(xBatch)

            # compute and print loss
            loss = model.lossFn(yPred, yBatch)

            # guarded against batchIntv of 0, which would otherwise fail in the modulo
            atBatchIntv = model.batchIntv > 0 and b % model.batchIntv == 0
            if model.verbose and t % epochIntv == 0 and atBatchIntv:
                print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

            if epochLevelTrack:
                epochLoss += loss.item()

            # error tracking at batch level
            if model.trackErr and atBatchIntv:
                trErr.append(loss.item())
                vloss = FeedForwardNetwork.evaluateModel(model)
                vaErr.append(vloss)

            # zero gradients, perform a backward pass, and update the weights
            model.optimizer.zero_grad()
            loss.backward()
            model.optimizer.step()

        # error tracking at epoch level, with loss averaged over the batches
        if epochLevelTrack:
            epochLoss /= len(trainDataLoader)
            trErr.append(epochLoss)
            vloss = FeedForwardNetwork.evaluateModel(model)
            vaErr.append(vloss)

    # validate, no gradient bookkeeping needed for the validation forward pass
    model.eval()
    with torch.no_grad():
        yPred = model(model.validFeatData)
    yPred = yPred.data.cpu().numpy()
    yActual = model.validOutData
    if model.verbose:
        vsize = yPred.shape[0]
        print("\npredicted \t\t actual")
        for i in range(vsize):
            print(str(yPred[i]) + "\t" + str(yActual[i]))

    score = perfMetric(model.accMetric, yActual, yPred)
    # NOTE(review): these two prints are not gated by verbose, looks like debug output
    print(yActual)
    print(yPred)
    print(formatFloat(3, score, "perf score"))

    # save
    modelSave = model.config.getBooleanConfig("train.model.save")[0]
    if modelSave:
        FeedForwardNetwork.saveCheckpt(model)

    if model.trackErr:
        FeedForwardNetwork.errorPlot(model, trErr, vaErr)

    if model.config.getBooleanConfig("train.print.weights")[0]:
        print("model weights")
        for param in model.parameters():
            print(param.data)
    return score
|
| 652 |
+
|
| 653 |
+
@staticmethod
def errorPlot(model, trErr, vaErr):
    """
    draws training and validation error curves against the tracking step and shows the plot

    Parameters
        model : torch model, not used here
        trErr : training error list
        vaErr : validation error list
    """
    # both curves share the same x axis, one point per tracked error value
    steps = np.arange(len(trErr))
    curves = ((trErr, "training error"), (vaErr, "validation error"))
    for errors, title in curves:
        plt.plot(steps, errors, label = title)

    plt.xlabel("iteration")
    plt.ylabel("error")
    plt.legend([title for _, title in curves], loc='upper left')
    plt.show()
|
| 669 |
+
|
| 670 |
+
@staticmethod
def modelPredict(model, dataSource = None):
    """
    predict, after restoring the saved model or training a new one depending on configuration

    Parameters
        model : torch model
        dataSource : data source, the configured prediction data file is used when None

    Returns
        model output, post processed by processClassifOutput when model.outputSize >= 2
    """
    # train or restore model
    useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
    if useSavedModel:
        FeedForwardNetwork.restoreCheckpt(model)
    else:
        FeedForwardNetwork.batchTrain(model)

    # feature data only, output field excluded
    if dataSource is None:
        dataSource = model.config.getStringConfig("predict.data.file")[0]
    featData = FeedForwardNetwork.prepData(model, dataSource, False)
    featData = torch.from_numpy(featData).to(model.device)

    # predict, no gradient bookkeeping needed for inference
    model.eval()
    with torch.no_grad():
        yPred = model(featData)
    yPred = yPred.data.cpu().numpy()

    if model.outputSize >= 2:
        # classification
        yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)

    # print prediction
    if model.config.getBooleanConfig("predict.print.output")[0]:
        FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)

    return yPred
|
| 708 |
+
|
| 709 |
+
def predict(self, dataSource = None):
    """
    instance level entry point for prediction, delegates to modelPredict with this model

    Parameters
        dataSource : data source, passed through unchanged
    """
    prediction = FeedForwardNetwork.modelPredict(self, dataSource)
    return prediction
|
| 717 |
+
|
| 718 |
+
@staticmethod
|
| 719 |
+
def evaluateModel(model):
|
| 720 |
+
"""
|
| 721 |
+
evaluate model
|
| 722 |
+
|
| 723 |
+
Parameters
|
| 724 |
+
model : torch model
|
| 725 |
+
"""
|
| 726 |
+
model.eval()
|
| 727 |
+
with torch.no_grad():
|
| 728 |
+
yPred = model(model.validFeatData)
|
| 729 |
+
#yPred = yPred.data.cpu().numpy()
|
| 730 |
+
yActual = model.validOutData
|
| 731 |
+
score = model.lossFn(yPred, yActual).item()
|
| 732 |
+
model.train()
|
| 733 |
+
return score
|
| 734 |
+
|
| 735 |
+
@staticmethod
|
| 736 |
+
def prepValidate(model, dataSource=None):
|
| 737 |
+
"""
|
| 738 |
+
prepare for validation
|
| 739 |
+
|
| 740 |
+
Parameters
|
| 741 |
+
model : torch model
|
| 742 |
+
dataSource : data source
|
| 743 |
+
"""
|
| 744 |
+
#train or restore model
|
| 745 |
+
if not model.restored:
|
| 746 |
+
useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
|
| 747 |
+
if useSavedModel:
|
| 748 |
+
FeedForwardNetwork.restoreCheckpt(model)
|
| 749 |
+
else:
|
| 750 |
+
FeedForwardNetwork.batchTrain(model)
|
| 751 |
+
model.restored = True
|
| 752 |
+
|
| 753 |
+
if dataSource is not None:
|
| 754 |
+
model.setValidationData(dataSource)
|
| 755 |
+
|
| 756 |
+
@staticmethod
def validateModel(model, retPred=False):
    """
    model validation with the validation data held by the model, prediction is
    also kept in model.yPred

    Parameters
        model : torch model
        retPred : if True return prediction

    Returns
        performance score, or tuple of list of (predicted, actual) pairs and score
        when retPred is True
    """
    # no gradient bookkeeping needed for the validation forward pass
    model.eval()
    with torch.no_grad():
        yPred = model(model.validFeatData)
    yPred = yPred.data.cpu().numpy()
    model.yPred = yPred
    yActual = model.validOutData
    vsize = yPred.shape[0]
    if model.verbose:
        print("\npredicted \t actual")
        for i in range(vsize):
            print("{:.3f}\t\t{:.3f}".format(yPred[i][0], yActual[i][0]))

    score = perfMetric(model.accMetric, yActual, yPred)
    print(formatFloat(3, score, "perf score"))

    if not retPred:
        return score

    # only the first output column is paired up
    y = [(yPred[i][0], yActual[i][0]) for i in range(vsize)]
    return (y, score)
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
|