Source code for db.mldb

# Copyright (c) 2018 - 2019, Andreas Adelmann, Paul Scherrer Institut, Villigen PSI, Switzerland
#                            Nicole Neveu, SLAC National Accelerator Laboratory
# All rights reserved
#
# This file is part of pyOPALTools.
#
# pyOPALTools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# You should have received a copy of the GNU General Public License
# along with pyOPALTools. If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import sys
import os
import pandas as pd
from datetime import datetime
from bisect   import bisect_left
from opal import load_dataset, filetype

if sys.version_info[0] < 3:
  # Python 2
  import cPickle as pick
else:
  # Python 3
  import pickle as pick

from opal.parser.OptimizerParser import OptimizerParser
# SDDSParser is needed by mldb.buildFromSDDS below
from utilities import SDDSParser

def strToFloat(in_array):
    out_array = []
    for in_row in in_array:
        if type(in_row) is str:
            a = in_row.split()
            out_row = [float(i) for i in a]
        else:
            out_row = [float(i) for i in in_row]
        out_array.append(out_row)
    return out_array

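# Example (illustrative): rows may be whitespace-separated strings or sequences
#   >>> strToFloat(['1 2 3', [4, '5']])
#   [[1.0, 2.0, 3.0], [4.0, 5.0]]
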
# From https://stackoverflow.com/questions/12141150/from-list-of-integers-get-number-closest-to-a-given-value
def findClosestIndex(myList, myNumber):
    """Return the index of the entry in myList closest to myNumber.

    Assumes myList is sorted. If two entries are equally close,
    the index of the smaller one is returned.
    """
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return 0
    if pos == len(myList):
        return pos - 1
    before = myList[pos - 1]
    after = myList[pos]
    if after - myNumber < myNumber - before:
        return pos
    else:
        return pos - 1

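# Example (illustrative):
#   >>> findClosestIndex([0, 10, 20], 12)
#   1
#   >>> findClosestIndex([0, 10, 20], 15)   # tie: the smaller entry wins
#   1
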
def substring_after(s, delim):
    return s.partition(delim)[2]

def substring_before(s, delim):
    return s.partition(delim)[0]

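# Example (illustrative):
#   >>> substring_after('IBF=0.35', '=')
#   '0.35'
#   >>> substring_before('IBF=0.35', '=')
#   'IBF'
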
def checkBounds(data, keys):
    # nxs = number of x variables
    nxs = len(keys)
    # Print bounds of the bad points
    print('Now printing bounds of bad points:')
    for j in range(0, nxs):
        print('max of ' + keys[j] + ' = ' + str(max(data[0]['dvarValues'][:, j])))
        print('min of ' + keys[j] + ' = ' + str(min(data[0]['dvarValues'][:, j])))
    print('\n')

def buildBounded(pickle, baseFN):
    # Build a database using only simulations within the given bounds
    dbr = mldb()
    dbr.load(pickle)
    ulb  = dbr.getBounds()
    keys = dbr.getXNames()
    n    = len(keys)
    lb   = np.zeros((1, n))
    ub   = np.zeros((1, n))
    # Make arrays with upper bounds (ub) and lower bounds (lb)
    for i, key in enumerate(keys):
        lb[0, i] = ulb[key][0]
        ub[0, i] = ulb[key][1]
    print(lb)
    print(ub)
    totalgen  = dbr.getNumberOfSamples()
    bounded   = []
    unbounded = []
    xvec  = np.zeros((1, n))
    bxvec = np.zeros((1, n))
    objsNames = dbr.getYNames()
    nobjs = len(objsNames)
    yvec  = np.zeros((1, nobjs))
    byvec = np.zeros((1, nobjs))
    # Loop through each generation
    for gen in range(0, totalgen):
        nsims = dbr.getSampleSize(i=gen)
        gxvec = np.zeros((1, n))
        gyvec = np.zeros((1, nobjs))
        # Save extra info
        if gen == 0:
            bounded.append({'sampleSize': totalgen,
                            'dvarNames' : keys,
                            'objNames'  : objsNames,
                            'bounds'    : ulb})
        # Loop through each simulation in this generation
        for x in range(0, nsims):
            xvals = (dbr.getDVarVec(gen, x)).reshape((1, n))
            ovals = (dbr.getObjVec(gen, x)).reshape((1, nobjs))
            testlb = np.less_equal(xvals, lb)
            testub = np.greater_equal(xvals, ub)
            # Check if xvals <= lb or xvals >= ub
            if any(testlb[0]) or any(testub[0]):
                bxvec = np.append(bxvec, xvals, axis=0)
                byvec = np.append(byvec, ovals, axis=0)
            # Check if xvals is within all bounds
            elif not all(testlb[0]) and not all(testub[0]):
                #print(testlb[0])
                #print(testub[0])
                #print(xvals)
                xvec  = np.append(xvec,  xvals, axis=0)
                yvec  = np.append(yvec,  ovals, axis=0)
                gxvec = np.append(gxvec, xvals, axis=0)
                gyvec = np.append(gyvec, ovals, axis=0)
            else:
                print('Mistake, xvals not in boundaries expected.')
                print('Don\'t trust the database.')
        gxvec = gxvec[1:, :]
        gyvec = gyvec[1:, :]
        # Save the good points per generation
        bounded.append({'dvarValues': gxvec,
                        'objValues' : gyvec})
        print('generation # ' + str(gen+1) + ', Number of sims: ' + str(nsims)
              + ', Number of bounded sims: ' + str(np.size(gxvec[:, 0])),
              end='\r', flush=True)
    # Get rid of the placeholder rows
    xvec  = xvec[1:, :]
    yvec  = yvec[1:, :]
    bxvec = bxvec[1:, :]
    byvec = byvec[1:, :]
    # Save all data in one entry
    bounded.append({'allDvarValues': xvec,
                    'allObjValues' : yvec})
    # Save all bad points; this loses the generation info
    unbounded.append({'dvarValues': bxvec,
                      'objValues' : byvec})
    print('# bad pts:', str(np.size(bxvec[:, 0])), ', # good pts:', str(np.size(xvec[:, 0])))
    if np.size(bxvec[:, 0]) > 0:
        checkBounds(unbounded, keys)
    filename = baseFN + '-bounded.pk'
    print('Write ML-Database ' + filename)
    pick.dump(bounded, open(filename, 'wb'), -1)

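# Usage sketch (file names are hypothetical): given a database pickled by
# mldb.writeDB, write a second database that keeps only the in-bounds points:
#
#   buildBounded('opt-run.pk', 'opt-run')   # writes 'opt-run-bounded.pk'
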
class mldb:
    def __init__(self, descr=''):
        print('OPAL ML Database Generator \x1b[6;30;42m' + descr + '\x1b[0m')

    def build(self, filename_postfix, path):
        self.trainingSet = []
        optjson = OptimizerParser()
        optjson.parse(path + '/')
        numGenerations = optjson.getNumOfGenerations()
        for i in range(numGenerations):
            g = optjson.readGeneration(i+1)
            if i == 0:
                dvarsNames = optjson.getDesignVariables()
                objsNames  = optjson.getObjectives()
                bounds     = optjson.getBounds()
                self.trainingSet.append({'sampleSize': numGenerations,
                                         'dvarNames' : dvarsNames,
                                         'objNames'  : objsNames,
                                         'bounds'    : bounds})
            dvars = optjson.getAllInput()
            ovars = optjson.getAllOutput()
            self.trainingSet.append({'dvarValues': dvars,
                                     'objValues' : ovars})
        self.writeDB(filename_postfix)

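    # Usage sketch (the path is hypothetical): parse an OPAL optimizer run
    # and pickle the training set:
    #
    #   db = mldb('optimizer run')
    #   db.build('results', '/path/to/optimizer/output')   # writes 'results.pk'
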
    def buildFromSampler(self, jsonFN, root, yNames, statBaseFn='', dbname=''):
        """Build training set from an OPAL sampler run."""
        ds = load_dataset(root, fname=jsonFN)

        if not statBaseFn:
            # really ugly fix
            self.trainingSet = []
            numGenerations = 1
            dvarsNames = ds.design_variables
            objsNames  = ds.objectives
            bounds     = ds.bounds
            self.trainingSet.append({'sampleSize': numGenerations,
                                     'dvarNames' : dvarsNames,
                                     'objNames'  : objsNames,
                                     'bounds'    : bounds})
            nsamples = ds.size
            dvars = np.zeros((nsamples, len(dvarsNames)))
            ovars = np.zeros((nsamples, len(objsNames)))
            # find a better solution
            for i in range(nsamples):
                for j in range(len(dvarsNames)):
                    dvars[i, j] = float(ds.getData(var=dvarsNames[j], ind=i))
                for j in range(len(objsNames)):
                    ovars[i, j] = float(ds.getData(var=objsNames[j], ind=i))
            self.trainingSet.append({'dvarValues': dvars,
                                     'objValues' : ovars})
            if not dbname:
                dbname = 'sampler'
            self.writeDB(dbname)
            # the sampler-only case is complete
            return

        self.trainingSet = []
        x   = []
        y   = []
        fns = []
        for ind in range(0, ds.size):
            statData = load_dataset(root, fname=str(ind) + '/' + statBaseFn + '.stat')
            fns.append(str(ind) + '/' + statBaseFn + '.stat')
            xstr = ""
            ystr = ""
            for dvar in ds.design_variables:
                xstr += dvar + "=" + ds.getData(dvar, ind=ind) + " "
            x.append(xstr)
            for obj in yNames:
                ystr += str(statData.getData(obj)[-1]) + " "
            y.append(ystr)

        lDataSets = len(x)
        xDim = len(x[0].split())
        yDim = len(yNames)

        # the following is a copy from buildFromSDDS
        xNames  = []
        xValues = []
        xall = x[0].split()
        for i in range(xDim):
            xNames.append(substring_before(xall[i], '='))
            xValues.append(substring_after(xall[i], '='))

        # design variables
        dvarsNames = xNames
        # objective variables
        objsNames = yNames

        dvars = []
        ovars = []
        for i in range(lDataSets):
            xall = x[i].split()
            xi = []
            for j in range(xDim):
                xi.append(substring_after(xall[j], '='))
            dvars.append(xi)
            ovars.append(y[i])

        numGenerations = 1
        self.trainingSet.append({'sampleSize': numGenerations,
                                 'dvarNames' : dvarsNames,
                                 'objNames'  : objsNames,
                                 'dataFiles' : fns})
        self.trainingSet.append({'dvarValues': dvars,
                                 'objValues' : ovars})
        self.writeDB(statBaseFn)

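    # Usage sketch (file and directory names are hypothetical). Without
    # statBaseFn the objective values are read from the sampler JSON itself;
    # with statBaseFn they are taken from the last row of each sample's
    # <ind>/<statBaseFn>.stat file:
    #
    #   db = mldb('sampler run')
    #   db.buildFromSampler('samples.json', '/path/to/sampler/run',
    #                       yNames=['energy', 'dE'])   # writes 'sampler.pk'
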
    def buildFromSDDS(self, baseFN, root, yNames):
        """Build training set from SDDS (simulation) data,
        for example data obtained with OPAL.

        The dataDescriptionFile defines the input (aka design)
        and output (aka objective) variables.
        """
        self.trainingSet = []
        p = SDDSParser()
        x   = []
        y   = []
        fns = []
        cwd = os.getcwd()
        os.chdir(root)
        (x, y, fns) = p.collectStatFileData(baseFN, '.', yNames)
        os.chdir(cwd)

        lDataSets = len(x)
        xDim = len(x[0].split())
        yDim = len(yNames)
        xNames  = []
        xValues = []
        print('dim(x)= ', xDim, 'dim(y)= ', yDim, ' #datapoints= ', lDataSets)
        xall = x[0].split()
        for i in range(xDim):
            xNames.append(substring_before(xall[i], '='))
            xValues.append(substring_after(xall[i], '='))

        # design variables
        dvarsNames = xNames
        # objective variables
        objsNames = yNames

        dvars = []
        ovars = []
        for i in range(lDataSets):
            xall = x[i].split()
            xi = []
            for j in range(xDim):
                xi.append(substring_after(xall[j], '='))
            yi = []
            for j in range(len(yNames)):
                yi.append(y[i][j][-1])
            dvars.append(xi)
            ovars.append(yi)
        #print(str(ovars[0]) + '= f(' + str(dvars[0]) + ')')

        # Add to training set; treat everything as one generation
        numGenerations = 1
        self.trainingSet.append({'sampleSize': numGenerations,
                                 'dvarNames' : dvarsNames,
                                 'objNames'  : objsNames,
                                 'dataFiles' : fns})
        self.trainingSet.append({'dvarValues': dvars,
                                 'objValues' : ovars})
        self.writeDB(baseFN)

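    # Usage sketch (names are hypothetical): collect the last row of the
    # OPAL .stat files found below root and pickle the training set:
    #
    #   db = mldb('sdds scan')
    #   db.buildFromSDDS('scan', '/path/to/scan', ['rms_x', 'rms_s'])
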
    def buildASCII(self, path, dataDescriptionFile, dataFileName, interlockFileName):
        """Build training set from archiver data obtained with the
        ArchiveExport command.

        The dataDescriptionFile defines the input (aka design)
        and output (aka objective) variables.
        """
        self.trainingSet = []
        # Number of header lines
        nrHeaderLines = 4
        # Filling number that replaces bad numbers.
        # Lines containing this number will be removed.
        fillingNumber = 12356789

        # read dataDescriptionFile
        dataDescription = np.genfromtxt(path + '/' + dataDescriptionFile,
                                        dtype=[('input', 'U1'), ('name', '|U30'), ('unit', '|S8')],
                                        comments='#')
        # design variables
        dvarsNames = [pv['name'] for pv in dataDescription if pv['input'] == 'x']
        # objective variables
        objsNames = [pv['name'] for pv in dataDescription if pv['input'] == 'y']

        # read data (to be put in a separate class)
        # match with the header of the data file
        if sys.version_info[0] < 3:
            f = open(dataFileName)
        else:
            f = open(dataFileName, errors='ignore')
        for i in range(0, nrHeaderLines):
            header = f.readline()
        f.close()

        # The header has the following format: '# Time name1 [unit1] name2 [unit2] etc.'
        # We are interested in the names only.
        # Problem: a unit can contain spaces.
        # Solution: skip the first two words and those that contain a bracket.
        dataNames = [name for name in header.split()[2:]
                     if not ('[' in name or ']' in name)]
        if not dataNames:
            print('no header found in data')
            return

        # get variables in data
        # add 2 to the column number since the first column is the date and the second the time
        # the zip(*) operation 'inverts' the lists
        dcols, dvarsNamesInData = zip(*[(index+2, name) for index, name in enumerate(dataNames)
                                        if name in dvarsNames])
        ocols, objsNamesInData = zip(*[(index+2, name) for index, name in enumerate(dataNames)
                                       if name in objsNames])

        # print variables not in data
        dvarsNamesNotInData = [name for name in dvarsNames if name not in dataNames]
        objsNamesNotInData  = [name for name in objsNames  if name not in dataNames]
        if dvarsNamesNotInData:
            print('WARNING: Design variables not in data', dvarsNamesNotInData)
        if objsNamesNotInData:
            print('WARNING: Objectives not in data', objsNamesNotInData)

        # Input data
        # Need to skip lines with "#N/A" and "<no data>"
        dvars = np.genfromtxt(dataFileName,
                              usecols        = dcols,
                              skip_header    = nrHeaderLines,
                              comments       = None,
                              filling_values = fillingNumber)
        # Output data
        ovars = np.genfromtxt(dataFileName,
                              usecols        = ocols,
                              skip_header    = nrHeaderLines,
                              comments       = None,
                              filling_values = fillingNumber)
        # Timing data
        str2date = lambda x: datetime.strptime(x.decode("utf-8"), '%m/%d/%Y')
        # Strip nanosecond info
        str2time = lambda x: datetime.strptime(x.decode("utf-8")[:-3], '%H:%M:%S.%f')
        timevars = np.genfromtxt(dataFileName,
                                 usecols     = (0, 1),
                                 skip_header = nrHeaderLines,
                                 dtype       = None,
                                 converters  = {0: str2date, 1: str2time})
        # Combine date + time into a single datetime
        tvars = np.array([datetime.combine(date.date(), time.time())
                          for (date, time) in zip(timevars[:, 0], timevars[:, 1])])

        # remove rows with fillingNumber in dvars
        mask = ~(dvars == fillingNumber).any(1)
        if sum(~mask):
            print(sum(~mask), 'rows will be removed with non-complete design data')
        dvars = dvars[mask]
        ovars = ovars[mask]
        tvars = tvars[mask]
        # remove rows with fillingNumber in ovars
        mask = ~(ovars == fillingNumber).any(1)
        if sum(~mask):
            print(sum(~mask), 'rows will be removed with non-complete object data')
        dvars = dvars[mask]
        ovars = ovars[mask]
        tvars = tvars[mask]
        print(sum(mask), 'rows are left')

        # Interlock file
        if interlockFileName:
            # Procedure:
            # Add one entry for each interlock to objNames (proposal ILOCK-xxx)
            # Make zero vectors for each interlock
            # Use the interlock timestamps to add 1's
            # Add the vectors to objValues
            str2date = lambda x: datetime.strptime(x.decode("utf-8"), '%Y-%m-%d')
            # Now no nanosecond info
            str2time = lambda x: datetime.strptime(x.decode("utf-8"), '%H:%M:%S.%f')
            interlockvars = np.genfromtxt(interlockFileName,
                                          usecols    = (0, 1),
                                          dtype      = None,
                                          converters = {0: str2date, 1: str2time})
            # Combine date + time into a single datetime
            interlocktime = [datetime.combine(date.date(), time.time())
                             for (date, time) in zip(interlockvars[:, 0], interlockvars[:, 1])]
            # For each interlock time find the corresponding tvars time.
            # We know that the dates are sorted, so we can use binary search.
            interlocks = np.zeros(len(tvars))
            for time in interlocktime:
                interlocks[findClosestIndex(tvars, time)] = 1
            # If the interlock file covers a larger range than the data file,
            # the first and last interlock might falsely be 1. Reset them to 0.
            interlocks[0]  = 0
            interlocks[-1] = 0
            # Add to objectives
            objsNamesInData += ('ILOCK',)
            # Reallocating ovars, is there a more efficient way?
            ovars = np.column_stack((ovars, interlocks))

        # Add to training set; treat everything as one generation
        numGenerations = 1
        self.trainingSet.append({'sampleSize': numGenerations,
                                 'dvarNames' : dvarsNamesInData,
                                 'objNames'  : objsNamesInData})
        self.trainingSet.append({'dvarValues': dvars,
                                 'objValues' : ovars,
                                 'times'     : tvars})
        self.writeDB(dataFileName)

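    # Usage sketch (PV names and file names are hypothetical). Each row of
    # the data description file marks a channel as input ('x') or output
    # ('y'), e.g. a 'dataDescr.txt' containing:
    #
    #   x  QUAD1:CURRENT  A
    #   x  QUAD2:CURRENT  A
    #   y  BPM1:X         mm
    #
    #   db = mldb('archiver data')
    #   db.buildASCII('.', 'dataDescr.txt', 'export.dat', 'interlocks.dat')
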
    def writeDB(self, filename_postfix):
        print('Write ML-Database ' + filename_postfix + '.pk')
        pick.dump(self.trainingSet, open(filename_postfix + '.pk', 'wb'), -1)

    def load(self, filename):
        with open(filename, 'rb') as f:
            if sys.version_info[0] < 3:
                self.trainingSet = pick.load(f)
            else:
                self.trainingSet = pick.load(f, encoding='latin1')

    def getSampleSize(self, i=0):
        return len(self.trainingSet[i+1]['dvarValues'])

    def getNumberOfSamples(self):
        return self.trainingSet[0]['sampleSize']

    def getXDim(self):
        return len(self.trainingSet[0]['dvarNames'])

    def getXNames(self):
        return self.trainingSet[0]['dvarNames']

    def getYDim(self):
        return len(self.trainingSet[0]['objNames'])

    def getYNames(self):
        return self.trainingSet[0]['objNames']

    def getAllDvar(self, gen):
        return self.trainingSet[gen+1]['dvarValues'][:]

    def getAllObj(self, gen):
        return self.trainingSet[gen+1]['objValues'][:]

    def getAllDvarData(self):
        data = self.getAllDvar(0)
        for i in range(1, self.getNumberOfSamples()):
            data = np.append(data, self.getAllDvar(i), axis=0)
        data = strToFloat(data)
        nQoIs = len(self.getXNames())
        data = np.asarray(data)
        data = data[:, :nQoIs]
        return data

    def getAllObjData(self):
        data = self.getAllObj(0)
        for i in range(1, self.getNumberOfSamples()):
            data = np.append(data, self.getAllObj(i), axis=0)
        data = strToFloat(data)
        nQoIs = len(self.getYNames())
        data = np.asarray(data)
        data = data[:, :nQoIs]
        return data

    def getDVarVec(self, gen, indiv):
        return self.trainingSet[gen+1]['dvarValues'][indiv]

    def getObjVec(self, gen, indiv):
        return self.trainingSet[gen+1]['objValues'][indiv]

    def getTimes(self, gen, indiv):
        return self.trainingSet[gen+1]['times'][indiv]

    def getBounds(self):
        return self.trainingSet[0]['bounds']

    def printOverview(self):
        if self.trainingSet:
            # Show relevant information
            print('xDim = ' + str(self.getXDim()) + ' -> ' + str(self.getXNames()))
            print('yDim = ' + str(self.getYDim()) + ' -> ' + str(self.getYNames()))
            print('generations = ' + str(self.getNumberOfSamples()))
            samples = 0
            for i in range(self.getNumberOfSamples()):
                samples += self.getSampleSize(i)
            print('Data points = ' + str(samples))
            # Example for a query:
            x = self.getDVarVec(0, 0)
            y = self.getObjVec(0, 0)
            print('Show first dataset from generation 0: y = f(x)')
            print(str(y) + ' = f(' + str(x) + ')')
        else:
            print('Load data first')
            sys.exit()

    def getQoIsPD(self):
        d = self.getAllObjData()
        x = pd.DataFrame(data=d, columns=self.getYNames())
        return x

    def getDvarsPD(self, ind=0):
        d = self.getAllDvarData()
        x = pd.DataFrame(data=d, columns=self.getXNames())
        return x

    def getQoiDvarPD(self, ind=0):
        generations = self.getNumberOfSamples()
        print(generations)
        data = pd.DataFrame(columns=self.getYNames() + self.getXNames())
        # use a running row index so later generations do not
        # overwrite the rows of earlier ones
        row = 0
        for g in range(generations):
            objs  = self.getAllObj(g)
            dvars = self.getAllDvar(g)
            for i in range(len(objs)):
                if type(objs[0]) is str:
                    a = [float(j) for j in objs[i].split()]
                else:
                    a = [float(j) for j in objs[i]]
                b = [float(j) for j in dvars[i]]
                data.loc[row] = a + b
                row += 1
        return data

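# Usage sketch ('results.pk' is a hypothetical database written by writeDB):
# load a pickled database, print a summary and pull the data into pandas:
#
#   db = mldb()
#   db.load('results.pk')
#   db.printOverview()
#   dvars = db.getDvarsPD()     # design variables as a DataFrame
#   qois  = db.getQoIsPD()      # objectives as a DataFrame
#   both  = db.getQoiDvarPD()   # objectives and design variables combined
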
#def main(argv):
#    readAscii = False                     # read ASCII or JSON
#    filename_postfix = "results.json"
#    dataDescriptionFile = "dataDescr.txt"
#    interlockFile = None
#    path = ""
#
#    for arg in argv:
#        if arg.startswith("--path"):
#            path = str.split(arg, "=")[1]
#        elif arg.startswith("--filename-postfix"):
#            filename_postfix = str.split(arg, "=")[1]
#        elif arg.startswith("--dataDescription"):
#            dataDescriptionFile = str.split(arg, "=")[1]
#        elif arg.startswith("--ascii"):
#            readAscii = True
#        elif arg.startswith("--interlock"):
#            interlockFile = str.split(arg, "=")[1]
#
#    dbO = mldb()
#
#    # Generate and save the database from the OPAL simulation
#    if readAscii:
#        dbO.buildASCII(path, dataDescriptionFile, filename_postfix, interlockFile)
#    else:
#        # JSON
#        dbO.build(filename_postfix, path)
#
#    # read the data back from the database
#    dbO.load(filename_postfix + '.pk')
#
#    # iterate over all generations and individuals;
#    # this can be used to create the neural net
#    x = []
#    y = []
#    for i in range(dbO.getNumberOfSamples()):
#        for j in range(dbO.getSampleSize(i)):
#            x = dbO.getDVarVec(i, j)
#            y = dbO.getObjVec(i, j)
#
#    dbO.printOverview()
#
## call main
#if __name__ == "__main__":
#    main(sys.argv[1:])