Bug in TDL (temporal-difference learning) seems to be fixed. The issue was an incorrect calculation of the next-state reward (the current state was reused instead of the next state) in the epsilon-greedy learner.

2012-11-28 19:45:03 -05:00
parent 214bdcd032
commit d24e7aee97
6 changed files with 130 additions and 101 deletions
--- a/src/net/woodyfolsom/msproj/ann/Connection.java
+++ b/src/net/woodyfolsom/msproj/ann/Connection.java
@@ -7,7 +7,7 @@ public class Connection {
 	private int src;
 	private int dest;
 	private double weight;
-	private transient double lastDelta = 0.0;
+	//private transient double lastDelta = 0.0;
 	private transient double trace = 0.0;
 	
 	public Connection() {
@@ -23,7 +23,7 @@ public class Connection {
 	public void addDelta(double delta) {
 		this.trace = delta;
 		this.weight += delta;
-		this.lastDelta = delta;
+		//this.lastDelta = delta;
 	}
 	
 	@XmlAttribute
@@ -31,10 +31,10 @@ public class Connection {
 		return dest;
 	}
 	
-	@XmlTransient
-	public double getLastDelta() {
-		return lastDelta;
-	}
+	//@XmlTransient
+	//public double getLastDelta() {
+	//	return lastDelta;
+	//}
 	
 	@XmlAttribute
 	public int getSrc() {
@@ -100,6 +100,6 @@ public class Connection {
 	
 	@Override
 	public String toString() {
-		return "Connection(" + src + ", " + dest +"), weight: " + weight;
+		return "Connection(src: " + src + ",dest: " + dest + ", trace:" + trace +"), weight: " + weight;
 	}
 }
--- a/src/net/woodyfolsom/msproj/ann/NNDataPair.java
+++ b/src/net/woodyfolsom/msproj/ann/NNDataPair.java
@@ -16,4 +16,9 @@ public class NNDataPair {
 	public NNData getIdeal() {
 		return ideal;
 	}
+	
+	@Override
+	public String toString() {
+		return input.toString() + " => " + ideal.toString();
+	}
 }
--- a/src/net/woodyfolsom/msproj/ann/TTTFilterTrainer.java
+++ b/src/net/woodyfolsom/msproj/ann/TTTFilterTrainer.java
@@ -1,8 +1,6 @@
 package net.woodyfolsom.msproj.ann;

-import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.util.ArrayList;
 import java.util.List;

@@ -15,49 +13,74 @@ import net.woodyfolsom.msproj.tictactoe.Policy;
 import net.woodyfolsom.msproj.tictactoe.RandomPolicy;
 import net.woodyfolsom.msproj.tictactoe.State;

-public class TTTFilterTrainer { //implements epsilon-greedy trainer? online version of NeuralNetFilter
+public class TTTFilterTrainer { // implements epsilon-greedy trainer? online
+								// version of NeuralNetFilter

 	public static void main(String[] args) throws FileNotFoundException {
-		double alpha = 0.0;
-		double lambda = 0.9;
-		int maxGames = 15000;
+		double alpha = 0.15;
+		double lambda = .95;
+		int maxGames = 1000;

 		new TTTFilterTrainer().trainNetwork(alpha, lambda, maxGames);
 	}

-	public void trainNetwork(double alpha, double lambda, int maxGames) throws FileNotFoundException {		
-		///
-		FeedforwardNetwork neuralNetwork = new MultiLayerPerceptron(true, 9,5,1);
+	public void trainNetwork(double alpha, double lambda, int maxGames)
+			throws FileNotFoundException {
+
+		FeedforwardNetwork neuralNetwork = new MultiLayerPerceptron(true, 9, 6,
+				1);
 		neuralNetwork.setName("TicTacToe");
 		neuralNetwork.initWeights();
-		TrainingMethod trainer = new TemporalDifference(0.5,0.5);
+		TrainingMethod trainer = new TemporalDifference(alpha, lambda);

 		System.out.println("Playing untrained games.");
 		for (int i = 0; i < 10; i++) {
-			System.out.println("" + (i+1) + ". " + playOptimal(neuralNetwork).getResult());
+			System.out.println("" + (i + 1) + ". "
+					+ playOptimal(neuralNetwork).getResult());
 		}

-		System.out.println("Learning from " + maxGames + " games of random self-play");
+		System.out.println("Learning from " + maxGames
+				+ " games of random self-play");

 		int gamesPlayed = 0;
 		List<RESULT> results = new ArrayList<RESULT>();
 		do {
-			GameRecord gameRecord = playEpsilonGreedy(0.90, neuralNetwork, trainer);
+			GameRecord gameRecord = playEpsilonGreedy(0.90, neuralNetwork,
+					trainer);
 			System.out.println("Winner: " + gameRecord.getResult());
 			gamesPlayed++;
 			results.add(gameRecord.getResult());
 		} while (gamesPlayed < maxGames);
-		///

-		System.out.println("Learned network after " + maxGames + " training games.");
-
-		double[][] validationSet = new double[8][];
+		System.out.println("Learned network after " + maxGames
+				+ " training games.");

 		for (int i = 0; i < results.size(); i++) {
 			if (i % 10 == 0) {
-				System.out.println("" + (i+1) + ". " + results.get(i));
+				System.out.println("" + (i + 1) + ". " + results.get(i));
 			}
 		}
+		
+		evalTestCases(neuralNetwork);
+
+		System.out.println("Playing optimal games.");
+		for (int i = 0; i < 10; i++) {
+			System.out.println("" + (i + 1) + ". "
+					+ playOptimal(neuralNetwork).getResult());
+		}
+
+		/*
+		 * File output = new File("ttt.net");
+		 * 
+		 * FileOutputStream fos = new FileOutputStream(output);
+		 * 
+		 * neuralNetwork.save(fos);
+		 */
+	}
+
+	private void evalTestCases(FeedforwardNetwork neuralNetwork) {
+		double[][] validationSet = new double[8][];
+
 		// empty board
 		validationSet[0] = new double[] { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
 				0.0, 0.0 };
@@ -81,10 +104,8 @@ public class TTTFilterTrainer { //implements epsilon-greedy trainer? online vers
 				0.0, -1.0 };

 		// about to win
-		validationSet[7] = new double[] { 
-				-1.0, 1.0, 1.0, 
-				 1.0, -1.0, 1.0, 
-				-1.0, -1.0, 0.0 };
+		validationSet[7] = new double[] { -1.0, 1.0, 1.0, 1.0, -1.0, 1.0, -1.0,
+				-1.0, 0.0 };

 		String[] inputNames = new String[] { "00", "01", "02", "10", "11",
 				"12", "20", "21", "22" };
@@ -92,18 +113,6 @@ public class TTTFilterTrainer { //implements epsilon-greedy trainer? online vers

 		System.out.println("Output from eval set (learned network):");
 		testNetwork(neuralNetwork, validationSet, inputNames, outputNames);
-		
-		System.out.println("Playing optimal games.");
-		for (int i = 0; i < 10; i++) {
-			System.out.println("" + (i+1) + ". " + playOptimal(neuralNetwork).getResult());
-		}
-		
-		/*
-		File output = new File("ttt.net");
-		
-		FileOutputStream fos = new FileOutputStream(output);
-		
-		neuralNetwork.save(fos);*/
 	}
 	
 	private GameRecord playOptimal(FeedforwardNetwork neuralNetwork) {
@@ -113,30 +122,34 @@ public class TTTFilterTrainer { //implements epsilon-greedy trainer? online vers

 		State state = gameRecord.getState();

+		System.out.println("Playing optimal game:");
+		
 		do {
 			Action action;
 			State nextState;

-				action = neuralNetPolicy.getAction(gameRecord.getState());
+			action = neuralNetPolicy.getAction(gameRecord.getState());

-				nextState = gameRecord.apply(action);
-			//System.out.println("Action " + action + " selected by policy " + selectedPolicy.getName());
-			//System.out.println("Next board state: " + nextState);
+			nextState = gameRecord.apply(action);
+			System.out.println("Action " + action + " selected by policy " +
+					neuralNetPolicy.getName());
+			System.out.println("Next board state: " + nextState);
 			state = nextState;
 		} while (!state.isTerminal());

-		//finally, reinforce the actual reward
+		// finally, reinforce the actual reward

 		return gameRecord;
 	}

-	private GameRecord playEpsilonGreedy(double epsilon, FeedforwardNetwork neuralNetwork, TrainingMethod trainer) {
+	private GameRecord playEpsilonGreedy(double epsilon,
+			FeedforwardNetwork neuralNetwork, TrainingMethod trainer) {
 		GameRecord gameRecord = new GameRecord();

 		Policy randomPolicy = new RandomPolicy();
 		Policy neuralNetPolicy = new NeuralNetPolicy(neuralNetwork);

-		//System.out.println("Playing epsilon-greedy game.");
+		// System.out.println("Playing epsilon-greedy game.");

 		State state = gameRecord.getState();
 		NNDataPair statePair;
@@ -158,17 +171,20 @@ public class TTTFilterTrainer { //implements epsilon-greedy trainer? online vers

 				nextState = gameRecord.apply(action);
 				statePair = NNDataSetFactory.createDataPair(state);
-				NNDataPair nextStatePair = NNDataSetFactory.createDataPair(nextState);
-				trainer.iteratePattern(neuralNetwork, statePair, nextStatePair.getIdeal());
+				NNDataPair nextStatePair = NNDataSetFactory
+						.createDataPair(nextState);
+				trainer.iteratePattern(neuralNetwork, statePair,
+						nextStatePair.getIdeal());
 			}
-			//System.out.println("Action " + action + " selected by policy " + selectedPolicy.getName());
+			// System.out.println("Action " + action + " selected by policy " +
+			// selectedPolicy.getName());

-			//System.out.println("Next board state: " + nextState);
+			// System.out.println("Next board state: " + nextState);

 			state = nextState;
 		} while (!state.isTerminal());

-		//finally, reinforce the actual reward
+		// finally, reinforce the actual reward
 		statePair = NNDataSetFactory.createDataPair(state);
 		trainer.iteratePattern(neuralNetwork, statePair, statePair.getIdeal());

@@ -180,7 +196,7 @@ public class TTTFilterTrainer { //implements epsilon-greedy trainer? online vers
 		for (int valIndex = 0; valIndex < validationSet.length; valIndex++) {
 			NNDataPair dp = new NNDataPair(new NNData(inputNames,
 					validationSet[valIndex]), new NNData(outputNames,
-					validationSet[valIndex]));
+					new double[] {0.0}));
 			System.out.println(dp + " => " + neuralNetwork.compute(dp));
 		}
 	}
--- a/src/net/woodyfolsom/msproj/ann/TemporalDifference.java
+++ b/src/net/woodyfolsom/msproj/ann/TemporalDifference.java
@@ -83,7 +83,7 @@ public class TemporalDifference extends TrainingMethod {

 	private void updateWeights(FeedforwardNetwork neuralNetwork, double predictionError) {
 		for (Connection connection : neuralNetwork.getConnections()) {
-			Neuron srcNeuron = neuralNetwork.getNeuron(connection.getSrc());
+			/*Neuron srcNeuron = neuralNetwork.getNeuron(connection.getSrc());
 			Neuron destNeuron = neuralNetwork.getNeuron(connection.getDest());
 			
 			double delta = alpha * srcNeuron.getOutput()
@@ -91,6 +91,13 @@ public class TemporalDifference extends TrainingMethod {
 			
 			// TODO allow for momentum
 			// double lastDelta = connection.getLastDelta();
+			connection.addDelta(delta);*/
+			Neuron srcNeuron = neuralNetwork.getNeuron(connection.getSrc());
+			Neuron destNeuron = neuralNetwork.getNeuron(connection.getDest());
+			double delta = alpha * srcNeuron.getOutput() 
+					* destNeuron.getGradient() + connection.getTrace() * lambda;
+			//TODO allow for momentum
+			//double lastDelta = connection.getLastDelta();
 			connection.addDelta(delta); 
 		}
 	}
--- a/src/net/woodyfolsom/msproj/tictactoe/NeuralNetPolicy.java
+++ b/src/net/woodyfolsom/msproj/tictactoe/NeuralNetPolicy.java
@@ -24,7 +24,8 @@ public class NeuralNetPolicy extends Policy {
 		
 		for (Action action : validMoves) {
 			State nextState = state.apply(action);
-			NNDataPair dataPair = NNDataSetFactory.createDataPair(state);
+			//NNDataPair dataPair = NNDataSetFactory.createDataPair(state);
+			NNDataPair dataPair = NNDataSetFactory.createDataPair(nextState);
 			//estimated reward for X
 			scores.put(action, neuralNet.compute(dataPair).getValues()[0]);
 		}