Небольшая референса по созданию датасетов и нейронкам keras

Достался мне небольшой гавнопроект по нейронкам. Чтобы не терять времени, быстренько запилю тут основные моменты.

входные данные имеют примерно такой вид и структуру

ID, DateTime, Data1, Data2

Данные снимаются каждые пять минут, и в результате получается файл, в котором куча разных идшников с графиками двух параметров переменной длины.
Естественно, в таком виде данные на вход нейронки подать нельзя, поэтому используем интерполяцию. Я беру 10 точек интерполяции и для каждой точки сохраняю время от начала эксперимента, среднее значение параметра 1, а также среднее, минимальное и максимальное значения параметра 2 — так сырые (raw) данные преобразуются в датасет следующего вида. Вообще, помимо минимума и максимума можно использовать и всякого рода дисперсии и отклонения — в зависимости от задачи.

TargetClassNUM,
Data1_point1 .. Data1_point10,
Data2_point1 .. Data2_Point10, 
Data2min_point1 .. Data2min_point10, 
Data2max_point1 .. Data2max_point10,
Time1 .. Time10

Кусок гавнокода, решающий задачу создания датасета из сырых данных:

import csv
import re
import datetime;
import sys;
import time;
from random import randint

def writeToFile(filename, fieldnames, records, interpolation_points):
    """Write interpolated records to a CSV file with a header row.

    Parameters
    ----------
    filename : str
        Output CSV path (overwritten if it already exists).
    fieldnames : list[str]
        Column names in the same order as the values inside each record:
        label, then p1_*, p2_*, p2l_*, p2h_*, t1_* groups
        (1 + 5 * interpolation_points columns in total).
    records : list[list]
        Flat rows as produced by calculate_times().
    interpolation_points : int
        Bucket count; kept for interface compatibility — the record
        layout is fully implied by `fieldnames`.
    """
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            # Field order in `fieldnames` matches the flat record layout,
            # so one zip replaces the five hand-written index loops of the
            # original (which duplicated the label-building code).
            writer.writerow(dict(zip(fieldnames, record)))

def writeTestDataFile(filename, fieldnames, records, interpolation_points):
    """Write test-split records to a CSV file.

    This was a byte-for-byte duplicate of writeToFile(); delegate to it
    so the train and test file formats can never drift apart.
    Interface is unchanged for existing callers.
    """
    writeToFile(filename, fieldnames, records, interpolation_points)

def num(s):
    """Parse *s* as a float; unparsable text falls back to 0.0."""
    try:
        value = float(s)
    except ValueError:
        value = 0.0
    return value

def zer(s):
    """Left-pad a value to two characters with a zero ('' -> '00', '5' -> '05')."""
    if s == '':
        return '00'
    return s if len(s) >= 2 else '0' + str(s)

def calculate_times(param1, param2, times, intervals, sucess):
    """Collapse one seed's variable-length time series into a fixed-size record.

    The time axis [times[0], times[-1]] is split into `intervals` equal
    buckets. For every bucket we emit the mean of param1, the mean, min
    and max of param2, and the bucket midpoint in minutes since the
    series start.

    Parameters
    ----------
    param1, param2 : list[float]
        Sample values, index-aligned with `times`.
    times : list[datetime.datetime]
        Non-empty, ascending sample timestamps.
    intervals : int
        Number of interpolation buckets (> 0).
    sucess : bool
        Class label; True -> 1, False -> 0.

    Returns
    -------
    list
        [label] + p1_means + p2_means + p2_mins + p2_maxs + t_minutes —
        exactly 1 + 5 * intervals entries, matching writeToFile's layout.

    Fixes over the original version: per-bucket sums are now reset between
    buckets and divided by the bucket's own sample count (the original
    divided a never-reset running sum by the running sample index), the
    sample that crossed a bucket boundary is no longer dropped, samples
    after a multi-bucket gap land in the correct bucket, and the final
    bucket is actually emitted so every list has exactly `intervals`
    entries.
    """
    begintime = times[0]
    endtime = times[-1]
    span = (endtime - begintime) / intervals

    # Assign every sample to its bucket first, then aggregate per bucket.
    buckets1 = [[] for _ in range(intervals)]
    buckets2 = [[] for _ in range(intervals)]
    for tm, v1, v2 in zip(times, param1, param2):
        if span.total_seconds() > 0:
            idx = int((tm - begintime) / span)
        else:
            idx = 0  # degenerate series: all samples share one timestamp
        if idx >= intervals:
            idx = intervals - 1  # endtime itself belongs to the last bucket
        buckets1[idx].append(v1)
        buckets2[idx].append(v2)

    p1list = []
    p2list = []
    p2l_list = []
    p2h_list = []
    t1list = []
    for k in range(intervals):
        b1 = buckets1[k]
        b2 = buckets2[k]
        if b2:
            p1list.append(sum(b1) / len(b1))
            p2list.append(sum(b2) / len(b2))
            p2l_list.append(min(b2))
            p2h_list.append(max(b2))
        else:
            # Empty bucket (large sampling gap): emit neutral zeros so the
            # record keeps its fixed layout. TODO confirm 0 is an acceptable
            # filler for the downstream network.
            p1list.append(0.0)
            p2list.append(0.0)
            p2l_list.append(0.0)
            p2h_list.append(0.0)
        # Bucket midpoint, in minutes since the start of the experiment.
        t1list.append(((k + 0.5) * span).total_seconds() / 60.0)

    outlist = [1 if sucess else 0]
    outlist.extend(p1list)
    outlist.extend(p2list)
    outlist.extend(p2l_list)
    outlist.extend(p2h_list)
    outlist.extend(t1list)
    return outlist

def readFromFile(rise, data1, data2, times, interpolation_points, sucess):
    """Read raw ';'-separated samples and build fixed-size records per seed.

    The input file contains rows ``ID;DateTime;Data1;Data2`` where
    consecutive rows sharing an ID form one seed's time series. Each time
    the ID changes, the accumulated series is collapsed by
    calculate_times() into one flat record.

    Parameters
    ----------
    rise : str
        Path to the raw CSV file.
    data1, data2, times : list
        Scratch accumulators, mutated in place; kept in the signature for
        backward compatibility with existing callers.
    interpolation_points : int
        Bucket count forwarded to calculate_times().
    sucess : bool
        Class label applied to every record from this file.

    Returns
    -------
    list
        One flat record per seed (see calculate_times for the layout).

    Fixes over the original version: the LAST seed in the file is now
    flushed too (it used to be silently dropped because records were only
    emitted when a new ID appeared); rows with fewer than 4 fields are
    skipped (the original checked ``len(row) < 3`` and then read
    ``row[3]``); and the duplicated parse/append code of the two branches
    is shared in one helper.
    """
    datetime_re = re.compile(
        r'(\d{4})\-(\d{2})\-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})')

    def append_sample(row):
        # Parse the timestamp; malformed rows are reported and their time
        # skipped, mirroring the original's best-effort behavior.
        try:
            m = datetime_re.search(row[1])
            stamp = datetime.datetime.strptime(
                m.group(3) + " " + m.group(2) + " " + m.group(1) + " " +
                m.group(4) + ":" + m.group(5) + ":" + m.group(6),
                "%d %m %Y %H:%M:%S")
            times.append(stamp)
        except Exception as e:
            print("something wrong with data " + str(e))
        # Values use ',' as the decimal separator; blank fields become 0.
        data1.append(num(row[2].replace(',', '.')) if row[2].strip() else 0)
        data2.append(num(row[3].replace(',', '.')) if row[3].strip() else 0)

    final_items = []
    with open(rise, newline='') as f:
        reader = csv.reader(f, delimiter=';', quoting=csv.QUOTE_NONE)
        seed_id = None
        for row in reader:
            if len(row) < 4:
                continue  # header or malformed line
            if seed_id is not None and row[0] != seed_id:
                # Seed boundary: collapse the finished series into a record.
                if times:
                    final_items.append(calculate_times(
                        data1, data2, times, interpolation_points, sucess))
                del times[:]
                del data1[:]
                del data2[:]
            seed_id = row[0]
            print("seed " + seed_id)
            append_sample(row)
    # Flush the final seed — the original version dropped it.
    if times:
        final_items.append(calculate_times(
            data1, data2, times, interpolation_points, sucess))
    return final_items
                




# ---- dataset construction driver -------------------------------------------

# Scratch accumulators handed to readFromFile (mutated there in place).
sucess_data1 = []
sucess_data2 = []
sucess_times = []
failed_data1 = []
failed_data2 = []
failed_times = []
interpolation_points = 10

test_amount = 0.1  # intended train/test split ratio (split currently disabled)

record_list = []

# Positive and negative classes come from two separate raw files.
records = readFromFile('sucess.csv', sucess_data1, sucess_data2,
                       sucess_times, interpolation_points, True)
records2 = readFromFile('failed.csv', failed_data1, failed_data2,
                        failed_times, interpolation_points, False)
records.extend(records2)

# Column names must follow the flat record layout produced by
# calculate_times: label, then five groups of interpolation_points columns.
labels = ['label']
for prefix in ('p1_', 'p2_', 'p2l_', 'p2h_', 't1_'):
    labels.extend(prefix + str(i) for i in range(interpolation_points))

test_records = []  # test split is currently disabled

writeToFile('train.csv', labels, records, interpolation_points)

Имея достаточное количество записей, конвертированных в датасет (с первым полем в качестве метки класса), можем приступать к обучению сети.

from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
import pandas as pd
import numpy as np
from keras.datasets import boston_housing
from keras.models import Sequential
from keras.utils.np_utils import to_categorical

import sys
import os

# Hyperparameters come from the command line: batch size and epoch count.
pbatch_size = int(sys.argv[1])
pepochs = int(sys.argv[2])

np.random.seed(42)  # reproducible weight init / shuffling

train = pd.read_csv('train.csv')
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
# use positional .iloc for the same column slicing.
labels = train.iloc[:, 0].values.astype('int32')      # first column = class label
x_train = train.iloc[:, 1:].values.astype('float32')  # remaining columns = features
y_train = np_utils.to_categorical(labels)  # one-hot targets for softmax head
nb_classes = y_train.shape[1]

# Standardize features column-wise. NOTE(review): a constant column gives
# std == 0 and would produce NaNs here — confirm the data rules that out.
mean = x_train.mean(axis=0)
std = x_train.std(axis=0)
x_train -= mean
x_train /= std

# Simple MLP: one wide ReLU layer, two linear 128-unit layers, softmax head.
model = Sequential([
    Dense(512, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(128),
    Dense(128),
    Dense(nb_classes, activation="softmax"),
])

model.compile(loss="categorical_crossentropy", optimizer="SGD",
              metrics=["accuracy"])

print(model.summary())

import os.path

model.fit(x_train, y_train, batch_size=pbatch_size, epochs=pepochs, verbose=1)

model.save_weights('seeds.hdf5')

# Sanity check: predict on the training set itself.
pred = model.predict(x_train)
print(str(pred))

# Evaluate the training-set predictions against the true labels.
suc = 0
failed = 0
count = 0
for i, row in enumerate(pred):
    # Predicted class = highest softmax probability. The original compared
    # only row[0] and row[1], which silently breaks for nb_classes > 2.
    v = int(np.argmax(row))

    label = int(labels[i])

    if v == label:
        stat = "success"
        suc += 1
    else:
        stat = "failed"
        failed += 1

    count += 1
    print("learned data result")
    print(stat + " " + str(row) + ">>" + str(v) + "===" + str(label))
print("=====================================================================")
print("success rate " + str(suc / count))


import os
os.system("pause")  # Windows-only pause; harmless no-op elsewhere

Дальнейшие действия сводятся к созданию датасетов из тестовых данных (аналогично обучающим) и передаче их в сеть с загрузкой весов из .hdf5-файла.

Добавить комментарий