Added script to pull historical stock data and resulting data files (1 per company). Added code to generate average price change per 1, 2 and 3-gram. Added code to output average price change per headline for VALIDATION dataset.

This commit is contained in:
Woody Folsom
2012-04-20 21:22:54 -04:00
parent eec32b19c1
commit 6e3680426e
65 changed files with 360216 additions and 102 deletions

6965
data/AAPL.txt Normal file

File diff suppressed because it is too large Load Diff

4286
data/ABC.txt Normal file

File diff suppressed because it is too large Load Diff

7319
data/ADM.txt Normal file

File diff suppressed because it is too large Load Diff

6964
data/AIG.txt Normal file

File diff suppressed because it is too large Load Diff

12662
data/BA.txt Normal file

File diff suppressed because it is too large Load Diff

6530
data/BAC.txt Normal file

File diff suppressed because it is too large Load Diff

6810
data/BBY.txt Normal file

File diff suppressed because it is too large Load Diff

0
data/BRKA.txt Normal file
View File

8906
data/C.txt Normal file

File diff suppressed because it is too large Load Diff

6126
data/CAH.txt Normal file

File diff suppressed because it is too large Load Diff

7643
data/COP.txt Normal file

File diff suppressed because it is too large Load Diff

6502
data/COST.txt Normal file

File diff suppressed because it is too large Load Diff

6888
data/CVS.txt Normal file

File diff suppressed because it is too large Load Diff

10675
data/CVX.txt Normal file

File diff suppressed because it is too large Load Diff

5968
data/DELL.txt Normal file

File diff suppressed because it is too large Load Diff

8905
data/DOW.txt Normal file

File diff suppressed because it is too large Load Diff

8906
data/F.txt Normal file

File diff suppressed because it is too large Load Diff

5886
data/FMCC.txt Normal file

File diff suppressed because it is too large Load Diff

8899
data/FNMA.txt Normal file

File diff suppressed because it is too large Load Diff

12662
data/GE.txt Normal file

File diff suppressed because it is too large Load Diff

355
data/GM.txt Normal file
View File

@@ -0,0 +1,355 @@
Date,Open,High,Low,Close,Volume,Adj_Close
2012-04-16,23.78,23.93,23.36,23.42,7528500,23.42
2012-04-13,24.22,24.29,23.62,23.80,7310900,23.80
2012-04-12,24.05,24.49,24.00,24.30,5041800,24.30
2012-04-11,23.84,24.28,23.82,24.03,9260900,24.03
2012-04-10,24.11,24.14,23.29,23.71,11856100,23.71
2012-04-09,24.22,24.40,24.03,24.20,7242200,24.20
2012-04-05,24.99,25.18,24.61,24.81,8211300,24.81
2012-04-04,25.10,25.40,24.90,25.10,9785500,25.10
2012-04-03,26.64,27.03,25.27,25.54,26410400,25.54
2012-04-02,26.03,26.91,25.98,26.76,14465300,26.76
2012-03-30,25.49,25.78,25.11,25.65,6035000,25.65
2012-03-29,25.02,25.37,24.94,25.31,6433300,25.31
2012-03-28,25.41,25.48,25.02,25.15,5162000,25.15
2012-03-27,25.42,25.85,25.34,25.35,6632500,25.35
2012-03-26,25.23,25.63,25.14,25.58,7957700,25.58
2012-03-23,25.04,25.32,24.77,25.17,5802800,25.17
2012-03-22,25.01,25.26,24.87,25.05,5605800,25.05
2012-03-21,25.17,25.49,25.10,25.29,6122800,25.29
2012-03-20,25.10,25.42,24.60,25.09,10819300,25.09
2012-03-19,25.50,25.70,25.24,25.39,5462700,25.39
2012-03-16,26.07,26.07,25.26,25.57,11752400,25.57
2012-03-15,26.37,26.58,25.88,26.05,10133700,26.05
2012-03-14,26.13,26.59,26.00,26.32,9082600,26.32
2012-03-13,25.48,26.10,25.33,26.07,8338500,26.07
2012-03-12,25.62,25.71,25.15,25.39,4006300,25.39
2012-03-09,25.70,25.82,25.39,25.62,6323600,25.62
2012-03-08,25.37,25.59,25.10,25.45,8646700,25.45
2012-03-07,24.75,25.21,24.75,24.88,6856400,24.88
2012-03-06,25.31,25.34,24.38,24.58,19071300,24.58
2012-03-05,26.27,26.40,25.80,26.00,7624700,26.00
2012-03-02,26.55,26.75,26.36,26.45,8442800,26.45
2012-03-01,26.19,26.80,26.15,26.47,8893300,26.47
2012-02-29,26.07,26.55,25.92,26.02,12018900,26.02
2012-02-28,26.49,26.53,26.10,26.14,7547000,26.14
2012-02-27,26.07,26.55,25.57,26.46,8600900,26.46
2012-02-24,26.90,26.95,26.06,26.07,9404600,26.07
2012-02-23,26.69,27.27,26.26,26.79,10300900,26.79
2012-02-22,27.08,27.13,26.53,26.55,10693300,26.55
2012-02-21,27.30,27.55,26.99,27.06,8593200,27.06
2012-02-17,27.17,27.68,27.01,27.34,17603600,27.34
2012-02-16,25.29,27.26,25.27,27.17,35313600,27.17
2012-02-15,25.73,25.75,24.90,24.93,13292000,24.93
2012-02-14,25.21,25.45,25.00,25.40,9885800,25.40
2012-02-13,26.00,26.00,25.21,25.34,10503600,25.34
2012-02-10,25.48,25.52,25.25,25.50,9819000,25.50
2012-02-09,25.96,26.22,25.50,25.74,7440000,25.74
2012-02-08,26.31,26.42,25.58,25.75,17392900,25.75
2012-02-07,26.62,26.62,26.15,26.22,10764100,26.22
2012-02-06,26.47,26.83,25.95,26.70,17264100,26.70
2012-02-03,25.00,26.44,24.79,26.18,25510600,26.18
2012-02-02,24.65,24.69,24.30,24.31,6670900,24.31
2012-02-01,24.33,24.58,24.07,24.37,13337500,24.37
2012-01-31,24.53,24.59,23.95,24.02,9190800,24.02
2012-01-30,24.06,24.57,23.95,24.23,6319800,24.23
2012-01-27,23.80,24.54,23.33,24.37,14784100,24.37
2012-01-26,25.10,25.50,24.60,24.72,11585300,24.72
2012-01-25,24.89,25.01,24.38,24.92,13441000,24.92
2012-01-24,24.84,24.93,24.50,24.79,8350000,24.79
2012-01-23,25.14,25.25,24.79,24.92,8923500,24.92
2012-01-20,24.93,25.11,24.84,25.00,9840500,25.00
2012-01-19,24.69,24.98,24.45,24.82,15222800,24.82
2012-01-18,24.27,24.58,24.02,24.51,8919200,24.51
2012-01-17,24.60,24.68,24.17,24.20,9385800,24.20
2012-01-13,24.27,24.65,23.91,24.29,12963100,24.29
2012-01-12,24.35,24.82,23.76,24.67,16750800,24.67
2012-01-11,23.37,24.64,23.34,24.47,21892300,24.47
2012-01-10,23.22,23.40,22.78,23.24,13538300,23.24
2012-01-09,23.20,23.43,22.70,22.84,12084500,22.84
2012-01-06,22.26,23.03,22.24,22.92,18234500,22.92
2012-01-05,21.10,22.29,20.96,22.17,17880600,22.17
2012-01-04,21.05,21.37,20.75,21.15,7856700,21.15
2012-01-03,20.83,21.18,20.75,21.05,9321300,21.05
2011-12-30,20.12,20.37,20.05,20.27,6971400,20.27
2011-12-29,19.85,20.25,19.71,20.21,6891400,20.21
2011-12-28,20.08,20.13,19.76,19.86,7116200,19.86
2011-12-27,20.43,20.43,20.08,20.09,5866200,20.09
2011-12-23,20.81,20.89,20.45,20.50,6480600,20.50
2011-12-22,20.25,20.85,20.17,20.70,7287200,20.70
2011-12-21,19.74,20.44,19.58,20.32,13054600,20.32
2011-12-20,19.42,19.90,19.35,19.69,13952500,19.69
2011-12-19,20.12,20.23,19.00,19.05,15608900,19.05
2011-12-16,20.16,20.52,19.97,20.15,9109900,20.15
2011-12-15,19.76,20.22,19.51,20.10,10152400,20.10
2011-12-14,19.95,20.01,19.42,19.47,15053700,19.47
2011-12-13,20.96,21.00,19.95,20.11,12647100,20.11
2011-12-12,20.66,20.90,20.56,20.80,6281600,20.80
2011-12-09,21.09,21.40,21.06,21.15,6491100,21.15
2011-12-08,21.76,21.76,20.85,20.98,11508500,20.98
2011-12-07,21.62,22.11,21.48,21.94,9194700,21.94
2011-12-06,21.60,21.88,21.28,21.68,10668900,21.68
2011-12-05,21.52,21.99,21.43,21.59,9808700,21.59
2011-12-02,21.35,21.73,21.13,21.28,9781200,21.28
2011-12-01,21.24,21.77,20.93,20.96,12128500,20.96
2011-11-30,21.02,21.31,20.94,21.29,11147700,21.29
2011-11-29,20.73,20.85,20.30,20.31,7665000,20.31
2011-11-28,20.95,21.43,20.65,20.74,8959100,20.74
2011-11-25,20.22,20.71,20.21,20.34,2757600,20.34
2011-11-23,20.48,20.69,20.06,20.24,9594500,20.24
2011-11-22,20.92,21.12,20.70,20.73,7288600,20.73
2011-11-21,21.10,21.18,20.54,21.05,11757900,21.05
2011-11-18,22.01,22.18,21.62,21.68,6592500,21.68
2011-11-17,22.72,22.75,21.56,21.79,12692700,21.79
2011-11-16,23.24,23.35,22.62,22.65,9297200,22.65
2011-11-15,22.81,23.53,22.60,23.35,13867200,23.35
2011-11-14,22.56,23.29,22.51,22.99,13433300,22.99
2011-11-11,22.95,23.10,22.22,22.51,15438300,22.51
2011-11-10,22.44,22.85,21.93,22.70,15866600,22.70
2011-11-09,23.07,23.57,22.15,22.31,32911600,22.31
2011-11-08,24.20,25.17,23.98,25.04,19891800,25.04
2011-11-07,23.93,24.15,23.52,24.01,9802800,24.01
2011-11-04,23.89,23.92,23.31,23.61,9452000,23.61
2011-11-03,23.82,24.12,22.76,24.03,19953000,24.03
2011-11-02,23.70,23.73,22.92,23.20,14368700,23.20
2011-11-01,24.82,24.90,23.25,23.33,25365200,23.33
2011-10-31,25.92,26.16,25.61,25.85,8853600,25.85
2011-10-28,26.22,26.55,26.00,26.45,10718700,26.45
2011-10-27,25.87,26.47,25.20,26.32,18428600,26.32
2011-10-26,25.02,25.28,24.69,24.99,17497500,24.99
2011-10-25,24.86,25.19,24.16,24.86,11384500,24.86
2011-10-24,24.28,25.24,24.25,24.98,13534100,24.98
2011-10-21,23.40,24.38,23.20,24.35,12352500,24.35
2011-10-20,23.02,23.18,22.51,22.96,9488400,22.96
2011-10-19,23.52,23.55,22.96,23.09,7600700,23.09
2011-10-18,23.19,23.87,22.77,23.54,14135700,23.54
2011-10-17,24.17,24.19,23.15,23.18,8633400,23.18
2011-10-14,23.68,24.16,23.36,24.16,14753200,24.16
2011-10-13,23.38,23.38,22.50,23.15,9858600,23.15
2011-10-12,22.94,23.97,22.86,23.41,17303900,23.41
2011-10-11,22.43,22.69,22.28,22.50,10258300,22.50
2011-10-10,22.51,22.97,22.36,22.62,11291700,22.62
2011-10-07,22.58,22.93,21.82,22.01,14476800,22.01
2011-10-06,22.30,22.60,21.75,22.35,13553000,22.35
2011-10-05,21.33,22.29,20.76,22.27,17155800,22.27
2011-10-04,19.45,21.46,19.05,21.42,23800000,21.42
2011-10-03,20.13,20.90,19.65,19.73,13950700,19.73
2011-09-30,20.44,20.50,20.10,20.18,10151100,20.18
2011-09-29,20.72,20.97,20.12,20.76,10781000,20.76
2011-09-28,21.20,21.44,20.37,20.41,10120900,20.41
2011-09-27,21.60,21.83,21.08,21.19,11170900,21.19
2011-09-26,21.32,21.44,20.53,21.08,8503500,21.08
2011-09-23,19.77,21.28,19.77,21.00,13722000,21.00
2011-09-22,20.59,20.99,20.04,20.24,17284200,20.24
2011-09-21,22.39,22.70,21.22,21.28,9850700,21.28
2011-09-20,23.05,23.10,22.42,22.43,9507100,22.43
2011-09-19,22.15,23.17,22.05,23.05,14082400,23.05
2011-09-16,22.68,22.77,22.34,22.61,7932400,22.61
2011-09-15,22.59,22.79,22.15,22.70,8039200,22.70
2011-09-14,22.22,22.49,21.62,22.18,11623100,22.18
2011-09-13,21.87,22.33,21.50,22.00,11189600,22.00
2011-09-12,21.15,21.95,21.00,21.87,9325700,21.87
2011-09-09,22.36,22.45,21.47,21.76,11920600,21.76
2011-09-08,22.79,23.13,22.24,22.48,11782500,22.48
2011-09-07,21.82,23.04,21.82,22.86,13412200,22.86
2011-09-06,21.36,21.58,20.88,21.44,14282500,21.44
2011-09-02,22.41,22.55,21.73,22.07,14086700,22.07
2011-09-01,24.09,24.25,22.91,23.03,16926200,23.03
2011-08-31,23.87,24.49,23.83,24.03,12329600,24.03
2011-08-30,23.46,23.74,23.05,23.58,8142900,23.58
2011-08-29,23.30,23.96,23.27,23.79,10642100,23.79
2011-08-26,22.20,23.08,21.80,22.87,10278000,22.87
2011-08-25,22.49,22.74,22.10,22.30,11099900,22.30
2011-08-24,21.89,22.39,21.57,22.37,14527400,22.37
2011-08-23,21.78,22.06,21.21,22.06,20007100,22.06
2011-08-22,22.54,22.72,21.18,21.71,27158700,21.71
2011-08-19,23.25,23.44,21.71,22.16,34432900,22.16
2011-08-18,23.99,24.08,23.27,23.60,15853700,23.60
2011-08-17,26.00,26.09,24.90,24.94,12062000,24.94
2011-08-16,26.22,26.69,25.69,25.83,10145500,25.83
2011-08-15,26.03,26.54,25.79,26.42,12994800,26.42
2011-08-12,26.30,26.50,25.49,25.75,18861500,25.75
2011-08-11,24.11,26.17,24.11,25.81,25179600,25.81
2011-08-10,24.66,25.00,23.83,23.92,20642400,23.92
2011-08-09,25.09,25.56,24.00,25.54,26704600,25.54
2011-08-08,24.61,25.35,23.79,24.57,32608700,24.57
2011-08-05,26.07,26.46,24.49,26.31,34926100,26.31
2011-08-04,27.16,27.20,25.71,25.99,38839500,25.99
2011-08-03,27.03,27.17,26.13,27.17,15790400,27.17
2011-08-02,27.75,28.09,27.02,27.05,18222000,27.05
2011-08-01,28.88,28.88,27.70,28.07,12825600,28.07
2011-07-29,27.62,28.10,27.31,27.68,12182900,27.68
2011-07-28,28.22,28.90,28.02,28.10,11029000,28.10
2011-07-27,28.90,29.00,28.03,28.14,14552900,28.14
2011-07-26,29.67,29.70,28.96,29.09,9268400,29.09
2011-07-25,29.53,29.84,29.39,29.50,8690700,29.50
2011-07-22,29.97,30.29,29.88,30.10,7820100,30.10
2011-07-21,29.39,30.02,29.16,29.96,12231600,29.96
2011-07-20,29.45,29.48,29.05,29.24,6997900,29.24
2011-07-19,29.20,29.49,28.76,29.33,10283400,29.33
2011-07-18,29.57,29.65,28.62,29.10,13850000,29.10
2011-07-15,30.39,30.62,29.52,29.76,10400000,29.76
2011-07-14,30.85,30.97,30.03,30.10,10800200,30.10
2011-07-13,30.87,31.30,30.61,30.75,8093200,30.75
2011-07-12,30.24,30.94,30.02,30.68,10536600,30.68
2011-07-11,30.98,31.20,30.55,30.75,8376600,30.75
2011-07-08,31.25,31.70,31.16,31.58,9872100,31.58
2011-07-07,31.79,32.08,31.65,31.80,13339400,31.80
2011-07-06,31.37,31.68,31.09,31.19,14223700,31.19
2011-07-05,30.87,31.36,30.58,30.86,8836900,30.86
2011-07-01,30.35,30.86,29.92,30.58,18098400,30.58
2011-06-30,30.30,30.56,30.22,30.36,18656100,30.36
2011-06-29,30.79,30.79,30.25,30.30,12716100,30.30
2011-06-28,30.21,30.79,30.18,30.50,12698300,30.50
2011-06-27,29.79,30.46,29.60,30.26,15348000,30.26
2011-06-24,30.15,30.30,29.66,29.92,50062300,29.92
2011-06-23,29.53,30.20,29.32,30.14,13780400,30.14
2011-06-22,29.62,30.18,29.50,29.97,15520300,29.97
2011-06-21,29.51,30.00,29.43,29.59,12648300,29.59
2011-06-20,28.87,29.60,28.77,29.52,9038000,29.52
2011-06-17,28.73,29.06,28.57,29.00,16732300,29.00
2011-06-16,28.70,28.99,28.17,28.59,14079400,28.59
2011-06-15,28.77,29.11,28.64,28.95,11669900,28.95
2011-06-14,28.92,29.49,28.86,29.11,10948400,29.11
2011-06-13,28.90,29.08,28.29,28.59,9791900,28.59
2011-06-10,29.30,29.30,28.65,28.85,11734000,28.85
2011-06-09,29.20,29.58,28.91,29.45,13593600,29.45
2011-06-08,28.52,29.34,28.40,28.86,16534000,28.86
2011-06-07,28.89,29.04,28.39,28.78,15237900,28.78
2011-06-06,29.02,29.41,28.55,28.56,13416100,28.56
2011-06-03,29.28,29.56,28.90,29.12,21968400,29.12
2011-06-02,30.33,30.56,29.40,29.60,22440900,29.60
2011-06-01,31.70,31.70,30.15,30.23,19134400,30.23
2011-05-31,31.44,31.87,31.13,31.81,22938900,31.81
2011-05-27,30.77,31.48,30.59,31.28,9114200,31.28
2011-05-26,31.08,31.10,30.52,30.68,12244400,30.68
2011-05-25,30.56,31.38,30.50,31.27,14390800,31.27
2011-05-24,31.13,31.20,30.50,30.83,8779100,30.83
2011-05-23,30.68,31.16,30.50,30.96,8971700,30.96
2011-05-20,31.36,31.50,31.10,31.18,7224100,31.18
2011-05-19,31.53,31.79,31.31,31.47,9575300,31.47
2011-05-18,31.07,31.62,31.00,31.52,9949400,31.52
2011-05-17,31.06,31.38,30.83,31.10,11191200,31.10
2011-05-16,31.25,31.50,31.08,31.10,7291700,31.10
2011-05-13,31.46,31.54,30.85,31.07,10837200,31.07
2011-05-12,31.07,31.60,30.93,31.42,14128200,31.42
2011-05-11,31.57,31.86,31.11,31.30,9073300,31.30
2011-05-10,31.47,31.64,31.33,31.61,7841900,31.61
2011-05-09,31.74,32.06,31.36,31.39,10716400,31.39
2011-05-06,32.50,32.60,31.84,31.91,12801800,31.91
2011-05-05,32.06,32.68,31.49,32.02,26623400,32.02
2011-05-04,33.16,33.47,32.71,33.04,20492000,33.04
2011-05-03,32.38,33.20,32.36,32.99,29894800,32.99
2011-05-02,32.41,32.50,31.92,32.18,11014500,32.18
2011-04-29,31.99,32.58,31.91,32.09,13774600,32.09
2011-04-28,31.76,32.10,31.48,31.91,15810100,31.91
2011-04-27,31.47,31.79,31.28,31.78,14945600,31.78
2011-04-26,31.39,31.51,30.96,31.27,15700000,31.27
2011-04-25,31.00,31.19,30.32,31.14,15442500,31.14
2011-04-21,30.05,31.00,30.01,30.95,18920800,30.95
2011-04-20,29.76,30.38,29.42,29.93,22038100,29.93
2011-04-19,29.81,29.91,29.17,29.59,19914800,29.59
2011-04-18,30.06,30.34,29.90,29.97,12745600,29.97
2011-04-15,30.59,30.72,30.18,30.24,9882800,30.24
2011-04-14,30.65,30.86,30.35,30.58,9044800,30.58
2011-04-13,31.23,31.32,30.59,30.86,13781100,30.86
2011-04-12,30.40,31.34,30.10,31.15,19648600,31.15
2011-04-11,31.34,31.45,30.55,30.77,15179100,30.77
2011-04-08,32.40,32.75,31.33,31.52,16057700,31.52
2011-04-07,32.84,32.84,32.07,32.31,11242400,32.31
2011-04-06,33.00,33.28,32.52,32.87,8160100,32.87
2011-04-05,32.32,32.87,32.10,32.87,10106900,32.87
2011-04-04,32.50,32.72,32.20,32.39,12331400,32.39
2011-04-01,31.39,32.63,30.84,32.41,29883100,32.41
2011-03-31,31.40,31.55,31.00,31.03,8976600,31.03
2011-03-30,31.16,31.64,31.04,31.55,7657000,31.55
2011-03-29,30.93,31.17,30.68,31.10,9756700,31.10
2011-03-28,31.58,31.58,30.85,30.85,10302600,30.85
2011-03-25,31.49,31.70,31.09,31.47,15242300,31.47
2011-03-24,31.32,31.60,31.24,31.39,15743100,31.39
2011-03-23,30.60,31.28,30.20,31.16,23016000,31.16
2011-03-22,31.28,31.35,30.51,30.74,16982100,30.74
2011-03-21,32.24,32.30,31.23,31.28,13196500,31.28
2011-03-18,31.74,31.95,31.42,31.85,9861200,31.85
2011-03-17,32.18,32.39,31.33,31.44,11025600,31.44
2011-03-16,32.42,32.53,31.40,31.78,14148900,31.78
2011-03-15,30.98,32.49,30.65,32.35,21471300,32.35
2011-03-14,32.14,32.30,31.43,31.59,9976500,31.59
2011-03-11,31.25,32.06,31.24,31.93,14024000,31.93
2011-03-10,31.47,31.78,30.95,31.42,38333600,31.42
2011-03-09,32.74,32.76,32.10,32.25,11225600,32.25
2011-03-08,31.74,32.82,31.69,32.72,14509400,32.72
2011-03-07,32.35,32.50,31.52,31.70,15077100,31.70
2011-03-04,33.07,33.08,32.01,32.39,24240900,32.39
2011-03-03,33.03,33.17,32.65,33.03,17614800,33.03
2011-03-02,32.90,33.17,32.59,32.88,14306700,32.88
2011-03-01,33.69,33.75,32.43,32.95,27321600,32.95
2011-02-28,33.49,33.74,32.86,33.53,15886400,33.53
2011-02-25,33.67,34.20,33.05,33.25,29017800,33.25
2011-02-24,34.90,35.00,32.05,33.02,63562800,33.02
2011-02-23,35.85,35.94,33.80,34.59,28192500,34.59
2011-02-22,35.86,36.15,35.45,35.77,13937400,35.77
2011-02-18,36.42,36.76,36.38,36.51,6815800,36.51
2011-02-17,36.55,36.70,36.30,36.37,7463000,36.37
2011-02-16,36.11,36.84,36.02,36.75,8680900,36.75
2011-02-15,36.19,36.41,35.80,36.11,10336500,36.11
2011-02-14,36.55,36.56,35.47,36.29,7121200,36.29
2011-02-11,35.76,36.57,35.55,36.45,13508300,36.45
2011-02-10,36.17,36.64,35.52,35.88,11542400,35.88
2011-02-09,36.82,36.91,36.28,36.41,6399300,36.41
2011-02-08,36.97,37.05,36.40,36.89,6243000,36.89
2011-02-07,36.95,37.09,36.61,36.70,7328800,36.70
2011-02-04,36.25,36.73,35.89,36.59,11115000,36.59
2011-02-03,35.97,36.06,35.13,36.06,19350000,36.06
2011-02-02,36.46,36.56,35.58,35.68,17434700,35.68
2011-02-01,36.93,37.23,36.13,36.45,28091600,36.45
2011-01-31,36.89,37.05,35.89,36.49,13954800,36.49
2011-01-28,38.00,38.02,36.01,36.60,37134600,36.60
2011-01-27,38.20,38.95,38.03,38.67,13950000,38.67
2011-01-26,38.75,38.91,37.86,37.89,11534600,37.89
2011-01-25,37.85,38.49,37.71,38.40,12318700,38.40
2011-01-24,37.71,37.86,37.03,37.64,12202800,37.64
2011-01-21,37.33,37.85,36.82,37.24,9548500,37.24
2011-01-20,37.11,37.29,36.27,37.18,15844900,37.18
2011-01-19,37.78,37.93,37.26,37.40,12667700,37.40
2011-01-18,38.05,38.33,37.32,38.03,8918200,38.03
2011-01-14,38.18,38.47,38.04,38.20,5894200,38.20
2011-01-13,38.66,38.71,38.11,38.27,11358300,38.27
2011-01-12,38.95,39.37,38.37,38.62,16773900,38.62
2011-01-11,38.66,39.43,38.51,38.75,14856500,38.75
2011-01-10,39.34,39.36,38.44,38.56,18341600,38.56
2011-01-07,38.84,39.33,38.51,38.98,19901100,38.98
2011-01-06,38.24,39.48,38.07,38.90,38556900,38.90
2011-01-05,37.47,38.30,37.47,38.07,22503900,38.07
2011-01-04,37.10,37.99,36.68,37.90,32363400,37.90
2011-01-03,37.32,38.00,37.03,37.06,24874900,37.06
2010-12-31,36.84,36.96,36.57,36.86,6163900,36.86
2010-12-30,36.10,36.98,36.02,36.82,16980800,36.82
2010-12-29,35.47,36.30,35.25,36.02,20960800,36.02
2010-12-28,35.38,35.67,35.07,35.32,23489000,35.32
2010-12-27,34.41,34.89,34.19,34.60,7368300,34.60
2010-12-23,34.67,35.52,34.62,34.81,20529200,34.81
2010-12-22,33.72,34.95,33.53,34.92,20935100,34.92
2010-12-21,33.86,33.94,33.72,33.85,9012400,33.85
2010-12-20,33.91,34.05,33.74,33.76,12476400,33.76
2010-12-17,33.53,34.00,33.19,34.00,35681600,34.00
2010-12-16,33.57,33.86,33.56,33.61,9885700,33.61
2010-12-15,33.81,34.01,33.61,33.61,10182700,33.61
2010-12-14,33.73,33.92,33.45,33.89,15165600,33.89
2010-12-13,33.96,34.05,33.70,33.80,11029300,33.80
2010-12-10,33.85,33.99,33.53,33.81,11741700,33.81
2010-12-09,34.36,34.43,33.62,33.74,18402200,33.74
2010-12-08,34.61,34.73,34.33,34.45,12603900,34.45
2010-12-07,34.75,34.89,34.46,34.68,20823000,34.68
2010-12-06,34.48,34.78,34.41,34.48,11676500,34.48
2010-12-03,34.55,34.60,33.97,34.55,19395200,34.55
2010-12-02,34.92,34.98,34.51,34.68,23196100,34.68
2010-12-01,34.65,34.95,34.42,34.78,34633200,34.78
2010-11-30,33.53,34.25,33.36,34.20,57476900,34.20
2010-11-29,33.80,33.81,33.07,33.80,27776900,33.80
2010-11-26,33.41,33.81,33.21,33.80,12301200,33.80
2010-11-24,33.73,33.80,33.22,33.48,26138000,33.48
2010-11-23,33.95,33.99,33.19,33.25,31170200,33.25
2010-11-22,34.20,34.48,33.81,34.08,36650600,34.08
2010-11-19,34.15,34.50,33.11,34.26,107842000,34.26
2010-11-18,35.00,35.99,33.89,34.19,457044300,34.19

6977
data/HD.txt Normal file

File diff suppressed because it is too large Load Diff

12662
data/HPQ.txt Normal file

File diff suppressed because it is too large Load Diff

12662
data/IBM.txt Normal file

File diff suppressed because it is too large Load Diff

10675
data/JNJ.txt Normal file

File diff suppressed because it is too large Load Diff

7138
data/JPM.txt Normal file

File diff suppressed because it is too large Load Diff

2726
data/KFT.txt Normal file

File diff suppressed because it is too large Load Diff

8906
data/KR.txt Normal file

File diff suppressed because it is too large Load Diff

6759
data/LOW.txt Normal file

File diff suppressed because it is too large Load Diff

4383
data/MCK.txt Normal file

File diff suppressed because it is too large Load Diff

2969
data/MET.txt Normal file

File diff suppressed because it is too large Load Diff

2181
data/MHS.txt Normal file

File diff suppressed because it is too large Load Diff

10669
data/MRO.txt Normal file

File diff suppressed because it is too large Load Diff

6583
data/MSFT.txt Normal file

File diff suppressed because it is too large Load Diff

8905
data/PEP.txt Normal file

File diff suppressed because it is too large Load Diff

7643
data/PFE.txt Normal file

File diff suppressed because it is too large Load Diff

10675
data/PG.txt Normal file

File diff suppressed because it is too large Load Diff

1503
data/SNPAX.txt Normal file

File diff suppressed because it is too large Load Diff

6999
data/T.txt Normal file

File diff suppressed because it is too large Load Diff

7325
data/TGT.txt Normal file

File diff suppressed because it is too large Load Diff

5562
data/UNH.txt Normal file

File diff suppressed because it is too large Load Diff

3122
data/UPS.txt Normal file

File diff suppressed because it is too large Load Diff

10675
data/UTX.txt Normal file

File diff suppressed because it is too large Load Diff

7636
data/VLO.txt Normal file

File diff suppressed because it is too large Load Diff

7165
data/VZ.txt Normal file

File diff suppressed because it is too large Load Diff

6759
data/WAG.txt Normal file

File diff suppressed because it is too large Load Diff

6925
data/WFC.txt Normal file

File diff suppressed because it is too large Load Diff

2635
data/WLP.txt Normal file

File diff suppressed because it is too large Load Diff

10003
data/WMT.txt Normal file

File diff suppressed because it is too large Load Diff

10675
data/XOM.txt Normal file

File diff suppressed because it is too large Load Diff

68
data/getStockHistory Normal file
View File

@@ -0,0 +1,68 @@
#!/bin/bash
# NAME
# getStockHistory -- Downloads historic stock prices for a given company stock.
#
# SYNOPSIS
# getStockHistory symbol [optional saveLocation]
# getStockHistory -f file [optional saveLocation]
#
# DESCRIPTION
# The getStockHistory command attempts to download the complete stock history for a given company or
# for a list of companies in a file.
#
# OPTIONS
# -f Use a file to have a list of stock symbols to download. Each stock symbol should be on a
# different line in the file.
#
#
# EXAMPLES
# The following downloads Apple's stock history.
# getStockHistory AAPL
#
# BUGS
#
# Written by: Rowland O'Flaherty (rowland.oflaherty@gmail.com)
# Created on: 02/08/10/
# getHistory SYMBOL [SAVE_LOCATION]
#
# Downloads the historical price CSV for SYMBOL (upper-cased first) into
# ./SYMBOL.csv, replaces the spaces in the header row with underscores
# (e.g. "Adj Close" -> "Adj_Close"), then moves the file to SAVE_LOCATION.
#
# Arguments:
#   $1  stock symbol; must be non-empty
#   $2  destination file or directory; when empty the CSV is left in the
#       current directory
# Returns:
#   exits the whole script with status 1 when SYMBOL is empty;
#   returns 1 (without exiting) when the download or header rewrite fails,
#   so a batch run can continue with the next symbol.
function getHistory() {
    # Locals keep the function from clobbering the caller's variables.
    local stockSymbol=$1
    local saveLocation=$2

    if [[ -z $stockSymbol ]]
    then
        echo "Invalid stock symbol"
        exit 1
    fi
    stockSymbol=$(printf '%s' "$stockSymbol" | tr '[:lower:]' '[:upper:]')

    # The "./" prefix stops a symbol beginning with "-" from being parsed as
    # a command-line option by sed/mv/rm.
    local csvFile="./${stockSymbol}.csv"
    local tmpFile="./${stockSymbol}.tmp"

    # NOTE(review): this Yahoo ichart endpoint may no longer be served -- confirm.
    if ! wget -q -O "$csvFile" "http://ichart.finance.yahoo.com/table.csv?s=${stockSymbol}&ignore=.csv"
    then
        # wget -O creates the output file even when the request fails; remove
        # it so an empty data file is never delivered to the save location.
        rm -f "$csvFile"
        echo "Failed to download stock history for ${stockSymbol}" >&2
        return 1
    fi

    # Only line 1 (the column header) is rewritten; data rows are untouched.
    if ! sed '1s/\ /_/g' "$csvFile" > "$tmpFile"
    then
        rm -f "$tmpFile"
        return 1
    fi
    mv "$tmpFile" "$csvFile" || return 1

    if [[ -n $saveLocation ]]
    then
        mv "$csvFile" "$saveLocation"
    fi
}
# ---------------------------------------------------------------------------
# Entry point: parse options, then download either a single symbol or every
# symbol listed in a file.
#
#   getStockHistory symbol  [saveLocation]
#   getStockHistory -f file [saveLocation]
#
# saveLocation defaults to $HOME/Downloads.
# ---------------------------------------------------------------------------
fOption=false
while getopts :f OPTION
do
    case $OPTION in
        f)
            fOption=true
            ;;
        '?')
            # Stop here: otherwise the unknown flag would be treated as a
            # stock symbol further down.
            echo "Invalid option $OPTARG" >&2
            exit 1
            ;;
    esac
done
# Drop the parsed options so $1/$2 mean the same thing in both modes.
shift $((OPTIND - 1))

saveLocation=${2:-$HOME/Downloads}

if [[ "$fOption" = "true" ]]
then
    fileLocation=$1
    if [[ ! -r $fileLocation ]]
    then
        echo "Cannot read symbol file: $fileLocation" >&2
        exit 1
    fi
    # Read the list on fd 3 so nothing run inside the loop can consume it via
    # stdin. Each line is split on whitespace (as the original word-splitting
    # loop did) but without glob expansion and without eval, so file contents
    # are never executed as shell code. The "|| (( ... ))" clause handles a
    # final line that lacks a trailing newline.
    while read -r -u 3 -a symbols || (( ${#symbols[@]} ))
    do
        for aSym in "${symbols[@]}"
        do
            getHistory "$aSym" "$saveLocation"
        done
    done 3< "$fileLocation"
else
    getHistory "$1" "$saveLocation"
fi

50
data/stock_symbols.csv Normal file
View File

@@ -0,0 +1,50 @@
1,Wal-Mart Stores,WMT
2,Exxon Mobil,XOM
3,Chevron,CVX
4,ConocoPhillips,COP
5,Fannie Mae,FNMA
6,General Electric,GE
7,Berkshire Hathaway,BRKA
8,General Motors,GM
9,Bank of America Corp.,BAC
10,Ford Motor,F
11,Hewlett-Packard,HPQ
12,AT&T,T
13,J.P. Morgan Chase & Co.,JPM
14,Citigroup,C
15,McKesson,MCK
16,Verizon Communications,VZ
17,American International Group,AIG
18,International Business Machines,IBM
19,Cardinal Health,CAH
20,Freddie Mac,FMCC
21,CVS Caremark,CVS
22,UnitedHealth Group,UNH
23,Wells Fargo,WFC
24,Valero Energy,VLO
25,Kroger,KR
26,Procter & Gamble,PG
27,AmerisourceBergen,ABC
28,Costco Wholesale,COST
29,Marathon Oil,MRO
30,Home Depot,HD
31,Pfizer,PFE
32,Walgreen,WAG
33,Target,TGT
34,Medco Health Solutions,MHS
35,Apple,AAPL
36,Boeing,BA
37,State Farm Insurance Cos.,SNPAX
38,Microsoft,MSFT
39,Archer Daniels Midland,ADM
40,Johnson & Johnson,JNJ
41,Dell,DELL
42,WellPoint,WLP
43,PepsiCo,PEP
44,United Technologies,UTX
45,Dow Chemical,DOW
46,MetLife,MET
47,Best Buy,BBY
48,United Parcel Service,UPS
49,Kraft Foods,KFT
50,Lowe's,LOW
1 1 Wal-Mart Stores WMT
2 2 Exxon Mobil XOM
3 3 Chevron CVX
4 4 ConocoPhillips COP
5 5 Fannie Mae FNMA
6 6 General Electric GE
7 7 Berkshire Hathaway BRKA
8 8 General Motors GM
9 9 Bank of America Corp. BAC
10 10 Ford Motor F
11 11 Hewlett-Packard HPQ
12 12 AT&T T
13 13 J.P. Morgan Chase & Co. JPM
14 14 Citigroup C
15 15 McKesson MCK
16 16 Verizon Communications VZ
17 17 American International Group AIG
18 18 International Business Machines IBM
19 19 Cardinal Health CAH
20 20 Freddie Mac FMCC
21 21 CVS Caremark CVS
22 22 UnitedHealth Group UNH
23 23 Wells Fargo WFC
24 24 Valero Energy VLO
25 25 Kroger KR
26 26 Procter & Gamble PG
27 27 AmerisourceBergen ABC
28 28 Costco Wholesale COST
29 29 Marathon Oil MRO
30 30 Home Depot HD
31 31 Pfizer PFE
32 32 Walgreen WAG
33 33 Target TGT
34 34 Medco Health Solutions MHS
35 35 Apple AAPL
36 36 Boeing BA
37 37 State Farm Insurance Cos. SNPAX
38 38 Microsoft MSFT
39 39 Archer Daniels Midland ADM
40 40 Johnson & Johnson JNJ
41 41 Dell DELL
42 42 WellPoint WLP
43 43 PepsiCo PEP
44 44 United Technologies UTX
45 45 Dow Chemical DOW
46 46 MetLife MET
47 47 Best Buy BBY
48 48 United Parcel Service UPS
49 49 Kraft Foods KFT
50 50 Lowe's LOW

View File

@@ -35,10 +35,10 @@ public class HeadlinePuller {
private static final int NO_ARGS = 5; private static final int NO_ARGS = 5;
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 6; private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 6;
@Autowired //@Autowired
HeadlineService mySQLHeadlineServiceImpl; //HeadlineService mySQLHeadlineServiceImpl;
@Autowired //@Autowired
HeadlineService yahooHeadlineServiceImpl; //HeadlineService yahooHeadlineServiceImpl;
private static void printUsage() { private static void printUsage() {
System.out System.out
@@ -115,10 +115,10 @@ public class HeadlinePuller {
for (calendar.setTime(startDate); (today = calendar.getTime()) for (calendar.setTime(startDate); (today = calendar.getTime())
.compareTo(endDate) <= 0; calendar .compareTo(endDate) <= 0; calendar
.add(Calendar.DATE, 1)) { .add(Calendar.DATE, 1)) {
List<Headline> headlines = headlinePuller.pullHeadlines( //List<Headline> headlines = headlinePuller.pullHeadlines(
company.getStockSymbol(), today); // company.getStockSymbol(), today);
int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines); //int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines);
System.out.println(updates.length + " rows updated"); //System.out.println(updates.length + " rows updated");
} }
} }
} catch (FileNotFoundException fnfe) { } catch (FileNotFoundException fnfe) {
@@ -132,12 +132,12 @@ public class HeadlinePuller {
} }
} }
private List<Headline> pullHeadlines(String stockSymbol, Date date) { //private List<Headline> pullHeadlines(String stockSymbol, Date date) {
List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines( //List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(
stockSymbol, date); // stockSymbol, date);
System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date); //System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date);
return headlines; //return headlines;
} //}
private List<Company> getFortune50(File csvFile) private List<Company> getFortune50(File csvFile)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {

View File

@@ -45,39 +45,30 @@ public class ModelGenerator {
Date startDate = null; Date startDate = null;
Date endDate = null; Date endDate = null;
Date valStart = null;
Date valEnd = null;
try { try {
startDate = dateFmt.parse("2012-01-01"); startDate = dateFmt.parse("2012-01-01");
endDate = dateFmt.parse("2012-03-31"); endDate = dateFmt.parse("2012-04-14");
valStart = dateFmt.parse("2012-04-01");
valEnd = dateFmt.parse("2012-04-14");
} catch (ParseException pe) { } catch (ParseException pe) {
System.exit(INVALID_DATE); System.exit(INVALID_DATE);
} }
List<Headline> trainingSet = new ArrayList<Headline>();
//actually, this is the TEST dataset
List<Headline> testSet = new ArrayList<Headline>();
try { try {
List<Company> fortune50 = modelGenerator List<Company> fortune50 = modelGenerator
.getFortune50(stockSymbolsCSV); .getFortune50(stockSymbolsCSV);
for (Company company : fortune50) { for (Company company : fortune50) {
System.out.println("Getting headlines for Fortune 50 company #" System.out.println("Getting headlines for Fortune 50 company #"
+ company.getId() + " (" + company.getName() + ")..."); + company.getId() + " (" + company.getName() + ")...");
List<Headline> trainingSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate); List<Headline> coTrainingSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 1);
System.out.println("Pulled " + trainingSet.size() + " headlines for " System.out.println("Pulled " + coTrainingSet.size() + " headlines for "
+ company.getStockSymbol() + " from " + startDate + " to " + endDate); + company.getStockSymbol() + " from " + startDate + " to " + endDate);
List<Headline> validationSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), valStart, valEnd); List<Headline> coTestSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 2);
if (trainingSet.size() == 0) { trainingSet.addAll(coTrainingSet);
System.out.println("Training dataset contains 0 headlines for " + company.getName() + ", skipping model generation."); testSet.addAll(coTestSet);
continue;
}
if (validationSet.size() == 0) {
System.out.println("Validation dataset contains 0 headlines for " + company.getName() + ", skipping model generation.");
continue;
}
modelGenerator.ngramModel.reportModel(trainingSet, validationSet);
System.out.println("Finished " + company.getId() + " / 50");
} }
} catch (FileNotFoundException fnfe) { } catch (FileNotFoundException fnfe) {
System.out.println("Stock symbol CSV file does not exist: " System.out.println("Stock symbol CSV file does not exist: "
@@ -88,6 +79,8 @@ public class ModelGenerator {
+ stockSymbolsCSV); + stockSymbolsCSV);
System.exit(IO_EXCEPTION); System.exit(IO_EXCEPTION);
} }
//modelGenerator.ngramModel.reportModel(trainingSet, testSet);
} }
private List<Company> getFortune50(File csvFile) private List<Company> getFortune50(File csvFile)

View File

@@ -0,0 +1,180 @@
package net.woodyfolsom.cs6601.p3;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.FileSystemXmlApplicationContext;
import org.springframework.stereotype.Component;
import net.woodyfolsom.cs6601.p3.domain.Company;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
import net.woodyfolsom.cs6601.p3.ngram.NGramModel;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
/**
 * Command-line tool that loads historical price files for every company
 * listed in {@code stock_symbols.csv}, pulls the TRAINING (dataset 1) and
 * TEST (dataset 2) headlines for the same companies, and passes everything to
 * {@link NGramModel#reportModel}.
 */
@Component
public class PricePoller {
    private static final File stockSymbolsCSV = new File("stock_symbols.csv");

    // Process exit codes.
    private static final int INVALID_DATE = 1;
    private static final int IO_EXCEPTION = 2;
    private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 3;

    // Columns in a price row: Date,Open,High,Low,Close,Volume,Adj_Close
    private static final int PRICE_FIELD_COUNT = 7;

    @Autowired
    HeadlineService mySQLHeadlineServiceImpl;

    private NGramModel ngramModel = new NGramModel();

    /**
     * Program entry point. Exits with {@code INVALID_DATE} if the hard-coded
     * date range cannot be parsed, {@code STOCK_SYMBOL_CSV_NOT_FOUND} if the
     * company list is missing and {@code IO_EXCEPTION} if it cannot be read.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        ApplicationContext context = new FileSystemXmlApplicationContext(
                new String[] { "AppContext.xml" });
        PricePoller modelGenerator = context.getBean(PricePoller.class);

        DateFormat dateFmt = new SimpleDateFormat("yyyy-MM-dd");
        Date startDate = null;
        Date endDate = null;
        try {
            startDate = dateFmt.parse("2012-01-01");
            endDate = dateFmt.parse("2012-04-14");
        } catch (ParseException pe) {
            System.exit(INVALID_DATE);
        }

        List<Headline> trainingSet = new ArrayList<Headline>();
        // Dataset 2 (TEST); handed to reportModel as its second argument.
        List<Headline> testSet = new ArrayList<Headline>();
        Map<String, Map<Date, StockPrice>> stockTrends = new HashMap<String, Map<Date, StockPrice>>();

        try {
            List<Company> fortune50 = modelGenerator.getFortune50(stockSymbolsCSV);

            for (Company company : fortune50) {
                // Register the (possibly empty) map first so every company has an entry
                // even when its price file is missing or unreadable.
                Map<Date, StockPrice> prices = new HashMap<Date, StockPrice>();
                stockTrends.put(company.getStockSymbol(), prices);
                System.out.println("Polling price data for " + company.getName());
                loadStockPrices(company.getStockSymbol(), dateFmt, startDate, endDate, prices);
            }

            for (Company company : fortune50) {
                System.out.println("Getting headlines for Fortune 50 company #"
                        + company.getId() + " (" + company.getName() + ")...");
                List<Headline> coTrainingSet = modelGenerator.mySQLHeadlineServiceImpl
                        .getHeadlines(company.getStockSymbol(), startDate, endDate, 1);
                System.out.println("Pulled " + coTrainingSet.size() + " TRAINING headlines for "
                        + company.getStockSymbol() + " from " + startDate + " to " + endDate);
                List<Headline> coTestSet = modelGenerator.mySQLHeadlineServiceImpl
                        .getHeadlines(company.getStockSymbol(), startDate, endDate, 2);
                System.out.println("Pulled " + coTestSet.size() + " TEST headlines for "
                        + company.getStockSymbol() + " from " + startDate + " to " + endDate);
                trainingSet.addAll(coTrainingSet);
                testSet.addAll(coTestSet);
            }
        } catch (FileNotFoundException fnfe) {
            System.out.println("Stock symbol CSV file does not exist: "
                    + stockSymbolsCSV);
            System.exit(STOCK_SYMBOL_CSV_NOT_FOUND);
        } catch (IOException ioe) {
            // The file exists but could not be read; report the real cause.
            System.out.println("Error reading stock symbol CSV file "
                    + stockSymbolsCSV + ": " + ioe.getMessage());
            System.exit(IO_EXCEPTION);
        }

        modelGenerator.ngramModel.reportModel(trainingSet, testSet, stockTrends);
    }

    /**
     * Reads {@code data/<symbol>.txt} and stores every row dated within
     * {@code [startDate, endDate]} into {@code prices}, keyed by the row's date.
     * A missing file, a read error or a malformed row is reported and skipped;
     * rows accepted before a read error stay in {@code prices}. The reader is
     * always closed.
     *
     * @param symbol    stock symbol, used to build the file name
     * @param dateFmt   format of the first column
     * @param startDate earliest date to keep (inclusive)
     * @param endDate   latest date to keep (inclusive)
     * @param prices    destination map, modified in place
     */
    private static void loadStockPrices(String symbol, DateFormat dateFmt,
            Date startDate, Date endDate, Map<Date, StockPrice> prices) {
        File stockPriceFile = new File("data" + File.separator + symbol + ".txt");
        BufferedReader buf;
        try {
            buf = new BufferedReader(new InputStreamReader(new FileInputStream(stockPriceFile)));
        } catch (FileNotFoundException fnfe) {
            System.out.println("Unable to find historical stock data file for: " + symbol);
            return;
        }

        try {
            String line;
            int linesRead = 0;
            while ((line = buf.readLine()) != null) {
                linesRead++;
                if (linesRead == 1) {
                    continue; // header line
                }
                String[] fields = line.trim().split(",");
                if (fields.length < PRICE_FIELD_COUNT) {
                    // Guard against blank or truncated rows instead of failing on fields[n].
                    System.out.println("Skipping malformed price line for " + symbol + ": " + line);
                    continue;
                }
                Date date;
                try {
                    date = dateFmt.parse(fields[0]);
                } catch (ParseException pe) {
                    System.out.println("Error parsing date: " + fields[0]);
                    continue;
                }
                if (date.compareTo(endDate) > 0) {
                    continue;
                }
                if (date.compareTo(startDate) < 0) {
                    // Stops at the first row older than the range; this presumes the
                    // file is ordered newest-first.
                    break;
                }
                StockPrice stockPrice;
                try {
                    stockPrice = new StockPrice(date,
                            Double.parseDouble(fields[1]),
                            Double.parseDouble(fields[2]),
                            Double.parseDouble(fields[3]),
                            Double.parseDouble(fields[4]),
                            Long.parseLong(fields[5]),
                            Double.parseDouble(fields[6]));
                } catch (NumberFormatException nfe) {
                    System.out.println(nfe.getMessage());
                    continue;
                }
                prices.put(date, stockPrice);
            }
        } catch (IOException ioe) {
            System.err.println(ioe.getMessage());
        } finally {
            try {
                buf.close();
            } catch (IOException ioe) {
                System.err.println(ioe.getMessage());
            }
        }
    }

    /**
     * Parses the company list: one {@code id,field,field} row per non-empty
     * line, passed in that order to the {@link Company} constructor.
     *
     * @param csvFile file to read
     * @return the companies in file order
     * @throws FileNotFoundException if {@code csvFile} cannot be opened
     * @throws IOException           if reading fails
     * @throws RuntimeException      if a row does not have exactly 3 values
     */
    private List<Company> getFortune50(File csvFile)
            throws FileNotFoundException, IOException {
        List<Company> fortune50 = new ArrayList<Company>();
        BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(csvFile)));
        try {
            String csvline;
            while ((csvline = buf.readLine()) != null) {
                if (csvline.length() == 0) {
                    continue;
                }
                String[] fields = csvline.split(",");
                if (fields.length != 3) {
                    throw new RuntimeException(
                            "Badly formatted csv file name (3 values expected): "
                                    + csvline);
                }
                int id = Integer.parseInt(fields[0]);
                fortune50.add(new Company(id, fields[1], fields[2]));
            }
        } finally {
            // Release the file handle on both the normal and the failure path.
            buf.close();
        }
        return fortune50;
    }
}

View File

@@ -0,0 +1,25 @@
package net.woodyfolsom.cs6601.p3;
import java.util.Calendar;
import java.util.Date;
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
/**
 * Static helpers for stepping to the next weekday and for expressing a day's
 * open-to-close price movement as a percentage.
 */
public class StockUtil {
    /**
     * Returns the first day strictly after {@code date} that is neither a
     * Saturday nor a Sunday. Only weekends are skipped; market holidays are
     * not considered. The time-of-day of {@code date} is carried over, and the
     * calculation uses the default {@link Calendar} instance.
     *
     * @param date starting point (not modified)
     * @return the following weekday
     */
    public static Date getNextTradingDay(Date date) {
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        int dayOfWeek;
        do {
            cal.add(Calendar.DATE, 1);
            dayOfWeek = cal.get(Calendar.DAY_OF_WEEK);
        } while (dayOfWeek == Calendar.SUNDAY || dayOfWeek == Calendar.SATURDAY);
        return cal.getTime();
    }

    /**
     * Returns the open-to-close change of {@code stockPrice} in percent.
     * An open of 0 is not special-cased, so the floating-point division then
     * yields an infinite or NaN result.
     *
     * @param stockPrice the day's prices
     * @return {@code 100 * (close / open - 1)}
     */
    public static double getPercentChange(StockPrice stockPrice) {
        double close = stockPrice.getClose();
        double open = stockPrice.getOpen();
        // If close is 2x open, the result is 100.0;
        // if close is 0.9 * open, the result is about -10.0.
        return 100.0 * ((close / open) - 1.00);
    }
}

View File

@@ -0,0 +1,256 @@
package net.woodyfolsom.cs6601.p3;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.FileSystemXmlApplicationContext;
import org.springframework.stereotype.Component;
import net.woodyfolsom.cs6601.p3.domain.Company;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
import net.woodyfolsom.cs6601.p3.ngram.NGram;
import net.woodyfolsom.cs6601.p3.ngram.NGramModel;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
/**
 * Command-line tool that writes {@code validation.txt}: one line per distinct
 * VALIDATION (dataset 3) headline text with the average next-weekday
 * open-to-close percent change observed for that text.
 */
@Component
public class ValidationSetCreator {
    private static final File stockSymbolsCSV = new File("stock_symbols.csv");

    // Process exit codes.
    private static final int INVALID_DATE = 1;
    private static final int IO_EXCEPTION = 2;
    private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 3;

    // Columns in a price row: Date,Open,High,Low,Close,Volume,Adj_Close
    private static final int PRICE_FIELD_COUNT = 7;

    @Autowired
    HeadlineService mySQLHeadlineServiceImpl;

    /**
     * Program entry point. Exits with {@code INVALID_DATE} if the hard-coded
     * date range cannot be parsed, {@code STOCK_SYMBOL_CSV_NOT_FOUND} if the
     * company list is missing and {@code IO_EXCEPTION} if the company list
     * cannot be read or the output file cannot be written.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        ApplicationContext context = new FileSystemXmlApplicationContext(
                new String[] { "AppContext.xml" });
        ValidationSetCreator modelGenerator = context
                .getBean(ValidationSetCreator.class);

        DateFormat dateFmt = new SimpleDateFormat("yyyy-MM-dd");
        Date startDate = null;
        Date endDate = null;
        try {
            startDate = dateFmt.parse("2012-01-01");
            endDate = dateFmt.parse("2012-04-14");
        } catch (ParseException pe) {
            System.exit(INVALID_DATE);
        }

        Map<String, Map<Date, StockPrice>> stockTrends = new HashMap<String, Map<Date, StockPrice>>();
        List<Headline> valSet = new ArrayList<Headline>();

        try {
            List<Company> fortune50 = modelGenerator
                    .getFortune50(stockSymbolsCSV);

            for (Company company : fortune50) {
                // Register the (possibly empty) map first so every company has an entry
                // even when its price file is missing or unreadable.
                Map<Date, StockPrice> prices = new HashMap<Date, StockPrice>();
                stockTrends.put(company.getStockSymbol(), prices);
                System.out.println("Polling price data for "
                        + company.getName());
                loadStockPrices(company.getStockSymbol(), dateFmt, startDate, endDate, prices);
            }

            for (Company company : fortune50) {
                System.out.println("Getting headlines for Fortune 50 company #"
                        + company.getId() + " (" + company.getName() + ")...");
                List<Headline> coValSet = modelGenerator.mySQLHeadlineServiceImpl
                        .getHeadlines(company.getStockSymbol(), startDate,
                                endDate, 3);
                System.out.println("Pulled " + coValSet.size()
                        + " VALIDATION headlines for "
                        + company.getStockSymbol() + " from " + startDate
                        + " to " + endDate);
                valSet.addAll(coValSet);
            }
        } catch (FileNotFoundException fnfe) {
            System.out.println("Stock symbol CSV file does not exist: "
                    + stockSymbolsCSV);
            System.exit(STOCK_SYMBOL_CSV_NOT_FOUND);
        } catch (IOException ioe) {
            // The file exists but could not be read; report the real cause.
            System.out.println("Error reading stock symbol CSV file "
                    + stockSymbolsCSV + ": " + ioe.getMessage());
            System.exit(IO_EXCEPTION);
        }

        File outputFile = new File("validation.txt");
        try {
            writeValidationFile(outputFile, valSet, stockTrends, dateFmt);
        } catch (IOException ioe) {
            // Kept separate from the CSV handlers so an output failure is not
            // misreported as a missing stock-symbol file.
            System.out.println("Unable to write " + outputFile + ": " + ioe.getMessage());
            System.exit(IO_EXCEPTION);
        }
    }

    /**
     * Reads {@code data/<symbol>.txt} and stores every row dated within
     * {@code [startDate, endDate]} into {@code prices}, keyed by the row's date.
     * A missing file, a read error or a malformed row is reported and skipped;
     * rows accepted before a read error stay in {@code prices}. The reader is
     * always closed.
     *
     * @param symbol    stock symbol, used to build the file name
     * @param dateFmt   format of the first column
     * @param startDate earliest date to keep (inclusive)
     * @param endDate   latest date to keep (inclusive)
     * @param prices    destination map, modified in place
     */
    private static void loadStockPrices(String symbol, DateFormat dateFmt,
            Date startDate, Date endDate, Map<Date, StockPrice> prices) {
        File stockPriceFile = new File("data" + File.separator + symbol + ".txt");
        BufferedReader buf;
        try {
            buf = new BufferedReader(new InputStreamReader(
                    new FileInputStream(stockPriceFile)));
        } catch (FileNotFoundException fnfe) {
            System.out
                    .println("Unable to find historical stock data file for: "
                            + symbol);
            return;
        }

        try {
            String line;
            int linesRead = 0;
            while ((line = buf.readLine()) != null) {
                linesRead++;
                if (linesRead == 1) {
                    continue; // header line
                }
                String[] fields = line.trim().split(",");
                if (fields.length < PRICE_FIELD_COUNT) {
                    // Guard against blank or truncated rows instead of failing on fields[n].
                    System.out.println("Skipping malformed price line for " + symbol + ": " + line);
                    continue;
                }
                Date date;
                try {
                    date = dateFmt.parse(fields[0]);
                } catch (ParseException pe) {
                    System.out.println("Error parsing date: "
                            + fields[0]);
                    continue;
                }
                if (date.compareTo(endDate) > 0) {
                    continue;
                }
                if (date.compareTo(startDate) < 0) {
                    // Stops at the first row older than the range; this presumes the
                    // file is ordered newest-first.
                    break;
                }
                StockPrice stockPrice;
                try {
                    stockPrice = new StockPrice(date,
                            Double.parseDouble(fields[1]),
                            Double.parseDouble(fields[2]),
                            Double.parseDouble(fields[3]),
                            Double.parseDouble(fields[4]),
                            Long.parseLong(fields[5]),
                            Double.parseDouble(fields[6]));
                } catch (NumberFormatException nfe) {
                    System.out.println(nfe.getMessage());
                    continue;
                }
                prices.put(date, stockPrice);
            }
        } catch (IOException ioe) {
            System.err.println(ioe.getMessage());
        } finally {
            try {
                buf.close();
            } catch (IOException ioe) {
                System.err.println(ioe.getMessage());
            }
        }
    }

    /**
     * Aggregates the percent change per distinct headline text and writes one
     * {@code id, stock, date, average, sanitized text} line for the first
     * headline carrying each text. A headline with no price for the following
     * weekday (or whose stock has no price map at all) contributes 0.0.
     *
     * @param file        output file, overwritten
     * @param valSet      headlines to report
     * @param stockTrends prices per stock symbol, keyed by date
     * @param dateFmt     format used for the date column
     * @throws IOException if the file cannot be created or written
     */
    private static void writeValidationFile(File file, List<Headline> valSet,
            Map<String, Map<Date, StockPrice>> stockTrends, DateFormat dateFmt)
            throws IOException {
        Map<String, Integer> headlineCount = new HashMap<String, Integer>();
        Map<String, Double> totalPctChange = new HashMap<String, Double>();

        for (Headline headline : valSet) {
            String text = headline.getText();
            Map<Date, StockPrice> prices = stockTrends.get(headline.getStock());
            StockPrice stockPrice = null;
            if (prices != null) {
                stockPrice = prices.get(StockUtil.getNextTradingDay(headline.getDate()));
            }
            double pctPriceChange;
            if (stockPrice == null) {
                pctPriceChange = 0.0;
            } else {
                pctPriceChange = StockUtil.getPercentChange(stockPrice);
            }

            Integer count = headlineCount.get(text);
            if (count == null) {
                headlineCount.put(text, 1);
                totalPctChange.put(text, pctPriceChange);
            } else {
                headlineCount.put(text, count + 1);
                totalPctChange.put(text, totalPctChange.get(text) + pctPriceChange);
            }
        }

        Set<String> processedSet = new HashSet<String>();
        DecimalFormat decFmt = new DecimalFormat("###0.0000");
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file)));
        try {
            for (Headline headline : valSet) {
                String text = headline.getText();
                if (!processedSet.add(text)) {
                    continue; // this text has already been written
                }

                int id = headline.getId();
                String stock = headline.getStock();
                String dateFormatted = dateFmt.format(headline.getDate());
                double totalPriceChange = totalPctChange.get(text);
                int totalCount = headlineCount.get(text);

                StringBuilder sb = new StringBuilder();
                sb.append(id);
                sb.append(", ");
                sb.append(stock);
                sb.append(", ");
                sb.append(dateFormatted);
                sb.append(", ");
                sb.append(decFmt.format(totalPriceChange / totalCount));
                sb.append(", ");

                // Punctuation becomes a space first, then anything that is not
                // alphanumeric or a space is dropped, so the text adds no extra commas.
                text = text.replaceAll(
                        "[\'\";:,\\]\\[]", " ");
                text = text.replaceAll(
                        "[^A-Za-z0-9 ]", "");
                sb.append(text);
                sb.append("\n");
                writer.write(sb.toString());
            }
        } finally {
            // Flush and release the file on both the normal and the failure path.
            writer.close();
        }
    }

    /**
     * Parses the company list: one {@code id,field,field} row per non-empty
     * line, passed in that order to the {@link Company} constructor.
     *
     * @param csvFile file to read
     * @return the companies in file order
     * @throws FileNotFoundException if {@code csvFile} cannot be opened
     * @throws IOException           if reading fails
     * @throws RuntimeException      if a row does not have exactly 3 values
     */
    private List<Company> getFortune50(File csvFile)
            throws FileNotFoundException, IOException {
        List<Company> fortune50 = new ArrayList<Company>();
        BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(csvFile)));
        try {
            String csvline;
            while ((csvline = buf.readLine()) != null) {
                if (csvline.length() == 0) {
                    continue;
                }
                String[] fields = csvline.split(",");
                if (fields.length != 3) {
                    throw new RuntimeException(
                            "Badly formatted csv file name (3 values expected): "
                                    + csvline);
                }
                int id = Integer.parseInt(fields[0]);
                fortune50.add(new Company(id, fields[1], fields[2]));
            }
        } finally {
            // Release the file handle on both the normal and the failure path.
            buf.close();
        }
        return fortune50;
    }
}

View File

@@ -6,12 +6,14 @@ import java.util.List;
import net.woodyfolsom.cs6601.p3.domain.Headline; import net.woodyfolsom.cs6601.p3.domain.Headline;
public interface HeadlineDao { public interface HeadlineDao {
boolean assignRandomDatasets(int training, int test, int validation);
int getCount();
int getCount(int dataset);
int deleteById(int id); int deleteById(int id);
int insert(Headline headline); int insert(Headline headline);
int[] insertBatch(List<Headline> headlines); int[] insertBatch(List<Headline> headlines);
Headline select(int id); Headline select(int id);
List<Headline> select(String stock, Date date); List<Headline> select(String stock, Date date);
List<Headline> select(String stock, Date startDate, Date endDate); List<Headline> select(String stock, Date startDate, Date endDate, int dataset);
} }

View File

@@ -18,16 +18,36 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
@Repository @Repository
public class HeadlineDaoImpl implements HeadlineDao { public class HeadlineDaoImpl implements HeadlineDao {
private static final String COUNT_ALL_QRY = "SELECT COUNT(1) FROM headlines";
private static final String COUNT_DATASET_QRY = "SELECT COUNT(1) FROM headlines where dataset = ?";
private static final String DELETE_BY_ID_STMT = "DELETE from headlines WHERE id = ?"; private static final String DELETE_BY_ID_STMT = "DELETE from headlines WHERE id = ?";
private static final String INSERT_STMT = "INSERT INTO headlines (text, date, stock, dataset) values (?, ?, ?, ?)"; private static final String INSERT_STMT = "INSERT INTO headlines (text, date, stock, dataset) values (?, ?, ?, ?)";
private static final String SELECT_BY_ID_QRY = "SELECT * from headlines WHERE id = ?"; private static final String SELECT_BY_ID_QRY = "SELECT * from headlines WHERE id = ?";
private static final String SELECT_BY_STOCK_QRY = "SELECT * from headlines WHERE stock = ? AND date = ?"; private static final String SELECT_BY_STOCK_QRY = "SELECT * from headlines WHERE stock = ? AND date = ? AND dataset = 1";
private static final String SELECT_BY_DATE_RANGE_QRY = "SELECT * from headlines WHERE stock = ? AND date >= ? AND date <= ?"; private static final String SELECT_BY_DATE_RANGE_QRY = "SELECT * from headlines WHERE stock = ? AND date >= ? AND date <= ? AND dataset = ?";
private static final String ASSIGN_RANDOM_PCT_QRY = "update headlines set dataset = (select FLOOR(RAND() * (200 - 101) + 101))";
private static final String REMAP_TRAINING_QRY = "update headlines set dataset = 1 where dataset >= 101 and dataset <= (100 + ?)";
private static final String REMAP_TEST_QRY = "update headlines set dataset = 2 where dataset >= (100 + ?) and dataset <= (100 + ?)";
private static final String REMAP_VAL_QRY = "update headlines set dataset = 3 where dataset >= (100 + ?) and dataset <= 200";
private JdbcTemplate jdbcTemplate; private JdbcTemplate jdbcTemplate;
@Override
public boolean assignRandomDatasets(int training, int test, int validation) {
if (training + test + validation != 100) {
return false;
}
jdbcTemplate.update(ASSIGN_RANDOM_PCT_QRY);
jdbcTemplate.update(REMAP_TRAINING_QRY,training);
jdbcTemplate.update(REMAP_TEST_QRY,training,training+test);
jdbcTemplate.update(REMAP_VAL_QRY,training+test);
return true;
}
public int deleteById(int headlineId) { public int deleteById(int headlineId) {
return jdbcTemplate.update(DELETE_BY_ID_STMT, return jdbcTemplate.update(DELETE_BY_ID_STMT,
new RequestMapper(), headlineId); new RequestMapper(), headlineId);
@@ -64,12 +84,12 @@ public class HeadlineDaoImpl implements HeadlineDao {
public List<Headline> select(String stock, Date date) { public List<Headline> select(String stock, Date date) {
return jdbcTemplate.query(SELECT_BY_STOCK_QRY, return jdbcTemplate.query(SELECT_BY_STOCK_QRY,
new RequestMapper(), stock, date); new RequestMapper(), stock, date, 1);
} }
public List<Headline> select(String stock, Date startDate, Date endDate) { public List<Headline> select(String stock, Date startDate, Date endDate, int dataset) {
return jdbcTemplate.query(SELECT_BY_DATE_RANGE_QRY, return jdbcTemplate.query(SELECT_BY_DATE_RANGE_QRY,
new RequestMapper(), stock, startDate, endDate); new RequestMapper(), stock, startDate, endDate, dataset);
} }
@Autowired @Autowired
@@ -82,6 +102,7 @@ public class HeadlineDaoImpl implements HeadlineDao {
@Override @Override
public Headline mapRow(ResultSet rs, int arg1) throws SQLException { public Headline mapRow(ResultSet rs, int arg1) throws SQLException {
Headline headline = new Headline(); Headline headline = new Headline();
headline.setId(rs.getInt("id"));
headline.setText(rs.getString("text")); headline.setText(rs.getString("text"));
headline.setStock(rs.getString("stock")); headline.setStock(rs.getString("stock"));
headline.setDate(rs.getDate("date")); headline.setDate(rs.getDate("date"));
@@ -90,4 +111,14 @@ public class HeadlineDaoImpl implements HeadlineDao {
} }
} }
@Override
public int getCount() {
return jdbcTemplate.queryForInt(COUNT_ALL_QRY);
}
@Override
public int getCount(int dataset) {
return jdbcTemplate.queryForInt(COUNT_DATASET_QRY,dataset);
}
} }

View File

@@ -0,0 +1,57 @@
package net.woodyfolsom.cs6601.p3.domain;
import java.util.Date;
/**
 * One row of a historical price file. All values are fixed at construction.
 */
public class StockPrice {
    // Column layout of the source data, with a sample row:
    //   Date,Open,High,Low,Close,Volume,Adj_Close
    //   2012-04-17,0.28,0.33,0.28,0.32,3408100,0.32
    final Date date;
    final double open;
    final double high;
    final double low;
    final double close;
    final long volume;
    final double adjClose;

    /**
     * Creates a price record; arguments follow the column order of the data file.
     *
     * @param date     the row's date (stored as given, not copied)
     * @param open     opening price
     * @param high     highest price
     * @param low      lowest price
     * @param close    closing price
     * @param volume   traded volume
     * @param adjClose adjusted closing price
     */
    public StockPrice(Date date, double open, double high, double low, double close,
            long volume, double adjClose) {
        this.date = date;
        this.open = open;
        this.high = high;
        this.low = low;
        this.close = close;
        this.volume = volume;
        this.adjClose = adjClose;
    }

    /** @return the date passed to the constructor */
    public Date getDate() {
        return this.date;
    }

    /** @return the opening price */
    public double getOpen() {
        return this.open;
    }

    /** @return the highest price */
    public double getHigh() {
        return this.high;
    }

    /** @return the lowest price */
    public double getLow() {
        return this.low;
    }

    /** @return the closing price */
    public double getClose() {
        return this.close;
    }

    /** @return the traded volume */
    public long getVolume() {
        return this.volume;
    }

    /** @return the adjusted closing price */
    public double getAdjClose() {
        return this.adjClose;
    }
}

View File

@@ -1,10 +1,17 @@
package net.woodyfolsom.cs6601.p3.ngram; package net.woodyfolsom.cs6601.p3.ngram;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.DecimalFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@@ -14,7 +21,9 @@ import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.woodyfolsom.cs6601.p3.StockUtil;
import net.woodyfolsom.cs6601.p3.domain.Headline; import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
public class NGramModel { public class NGramModel {
static final int MAX_N_GRAM_LENGTH = 3; static final int MAX_N_GRAM_LENGTH = 3;
@@ -25,6 +34,8 @@ public class NGramModel {
private static final String UNK = "<unk>"; private static final String UNK = "<unk>";
private Map<Integer, NGramDistribution> nGrams; private Map<Integer, NGramDistribution> nGrams;
private Map<Integer, Map<NGram,Double>> nGramPriceAvg;
private int[] totalNGramCounts = new int[MAX_N_GRAM_LENGTH + 1]; private int[] totalNGramCounts = new int[MAX_N_GRAM_LENGTH + 1];
private Pattern wordPattern = Pattern.compile("\\w+"); private Pattern wordPattern = Pattern.compile("\\w+");
@@ -88,9 +99,13 @@ public class NGramModel {
for (int i = 0; i <= MAX_N_GRAM_LENGTH; i++) { for (int i = 0; i <= MAX_N_GRAM_LENGTH; i++) {
nGrams.put(i, new NGramDistribution()); nGrams.put(i, new NGramDistribution());
} }
nGramPriceAvg = new HashMap<Integer, Map<NGram,Double>>();
for (int i = 0; i <= MAX_N_GRAM_LENGTH; i++) {
nGramPriceAvg.put(i, new HashMap<NGram,Double>());
}
} }
private void addNGram(int nGramLength, NGram nGram) { private void addNGram(int nGramLength, NGram nGram, String stockName, Date date, Map<String, Map<Date,StockPrice>> stockTrends) {
if (nGram.size() < nGramLength) { if (nGram.size() < nGramLength) {
System.out.println("Cannot create " + nGramLength + "-gram from: " System.out.println("Cannot create " + nGramLength + "-gram from: "
+ nGram); + nGram);
@@ -105,10 +120,31 @@ public class NGramModel {
} else { } else {
nGramCounts.put(nGramCopy, 1); nGramCounts.put(nGramCopy, 1);
} }
Map<NGram, Double> nGramPriceAvgs = nGramPriceAvg.get(nGramLength);
NGram nGramCopy2 = nGram.copy(nGramLength);
//TODO GET NEXT TRADING DAY'S DATE
Date nextDay = StockUtil.getNextTradingDay(date);
StockPrice stockPrice = stockTrends.get(stockName).get(nextDay);
double percentChange;
if (stockPrice == null) {
percentChange = 0.0;
} else {
percentChange = StockUtil.getPercentChange(stockPrice);
}
if (nGramPriceAvgs.containsKey(nGramCopy2)) {
double totalPercentChange = nGramPriceAvgs.get(nGramCopy);
nGramPriceAvgs.put(nGramCopy2, totalPercentChange + percentChange);
} else {
nGramPriceAvgs.put(nGramCopy2, percentChange);
}
} }
/** /**
* Given an arbitrary String, replace punctutation with spaces, remove * Given an arbitrary String, replace punctuation with spaces, remove
* non-alphanumeric characters, prepend with <START> token, append <END> * non-alphanumeric characters, prepend with <START> token, append <END>
* token. * token.
* *
@@ -193,12 +229,11 @@ public class NGramModel {
+ " recognized n-grams in verification corpus: " + perplexity); + " recognized n-grams in verification corpus: " + perplexity);
} }
private void generateModel(List<Headline> traininSet, boolean genRandom, private void generateModel(List<Headline> trainingSet, boolean genRandom,
boolean useUnk) throws FileNotFoundException, IOException { boolean useUnk, Map<String,Map<Date,StockPrice>> stockTrends) throws FileNotFoundException, IOException {
StringBuilder currentLine = new StringBuilder(); //List<String> fileByLines = new ArrayList<String>();
List<String> fileByLines = new ArrayList<String>();
for (Headline headline : traininSet) { for (Headline headline : trainingSet) {
String headlineText = headline.getText(); String headlineText = headline.getText();
if (headlineText.length() == 0) { if (headlineText.length() == 0) {
continue; continue;
@@ -206,6 +241,9 @@ public class NGramModel {
String sanitizedLine = sanitize(headline.getText()); String sanitizedLine = sanitize(headline.getText());
// split on whitespace // split on whitespace
String[] tokens = sanitizedLine.toLowerCase().split("\\s+"); String[] tokens = sanitizedLine.toLowerCase().split("\\s+");
StringBuilder currentLine = new StringBuilder();
for (String token : tokens) { for (String token : tokens) {
if (!isWord(token)) { if (!isWord(token)) {
continue; continue;
@@ -222,67 +260,67 @@ public class NGramModel {
if (END.equals(word)) { if (END.equals(word)) {
currentLine.append(word); currentLine.append(word);
fileByLines.add(currentLine.toString()); //fileByLines.add(currentLine.toString());
currentLine = new StringBuilder(); //currentLine = new StringBuilder();
} else { } else {
currentLine.append(word); currentLine.append(word);
currentLine.append(" "); currentLine.append(" ");
} }
} }
}
for (String str : fileByLines) { String str = currentLine.toString();
System.out.println(str); System.out.println(str);
NGram currentNgram = new NGram(MAX_N_GRAM_LENGTH); NGram currentNgram = new NGram(MAX_N_GRAM_LENGTH);
for (String token : str.split("\\s+")) { for (String token : str.split("\\s+")) {
currentNgram.add(token); currentNgram.add(token);
for (int i = 0; i <= currentNgram.size(); i++) { for (int i = 0; i <= currentNgram.size(); i++) {
addNGram(currentNgram.size() - i, addNGram(currentNgram.size() - i,
currentNgram.subNGram(i, currentNgram.size())); currentNgram.subNGram(i, currentNgram.size()), headline.getStock(), headline.getDate(), stockTrends);
totalNGramCounts[currentNgram.size() - i]++; totalNGramCounts[currentNgram.size() - i]++;
} }
} }
} }
System.out.println("Most common words: "); DecimalFormat decFmt = new DecimalFormat("###0.0000");
for (int modelIndex = 1; modelIndex <= 3; modelIndex++) {
List<Entry<NGram, Integer>> unigrams = new ArrayList<Entry<NGram, Integer>>( File file = new File(modelIndex + "grams.txt");
nGrams.get(1).entrySet()); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file)));
Collections.sort(unigrams, new NGramComparator()); List<Entry<NGram, Integer>> ngrams = new ArrayList<Entry<NGram, Integer>>(
nGrams.get(modelIndex).entrySet());
List<Entry<NGram, Integer>> bigrams = new ArrayList<Entry<NGram, Integer>>( Collections.sort(ngrams, new NGramComparator());
nGrams.get(2).entrySet()); System.out.println("Highest frequency " + modelIndex + "-grams:");
Collections.sort(bigrams, new NGramComparator()); for (int i = 1; i <= 10; i++) {
System.out
List<Entry<NGram, Integer>> trigrams = new ArrayList<Entry<NGram, Integer>>( .println(i
nGrams.get(3).entrySet()); + ". "
Collections.sort(trigrams, new NGramComparator()); + ngrams.get(i - 1).getKey()
+ " : "
for (int i = 1; i <= 10; i++) { + (((double) (ngrams.get(i - 1).getValue()) / totalNGramCounts[1])));
System.out }
.println(i Map<NGram,Double> pricesForModel = nGramPriceAvg.get(modelIndex);
+ ". " for (int nGramIndex = 1; nGramIndex <= ngrams.size(); nGramIndex++) {
+ unigrams.get(i - 1).getKey() NGram key = ngrams.get(nGramIndex - 1).getKey();
+ " : " writer.write(key.toString());
+ (((double) (unigrams.get(i - 1).getValue()) / totalNGramCounts[1]))); writer.write(",");
} int count = ngrams.get(nGramIndex - 1).getValue();
writer.write(Integer.toString(count));
for (int i = 1; i <= 10; i++) { writer.write(",");
System.out double avgPrice;
.println(i try {
+ ". " avgPrice = pricesForModel.get(key);
+ bigrams.get(i - 1).getKey() System.out.println("Avg price for " + modelIndex + "-gram " + key +": " + avgPrice);
+ " : " } catch (NullPointerException npe) {
+ (((double) (bigrams.get(i - 1).getValue()) / totalNGramCounts[2]))); System.out.println("null avgPrice for " + modelIndex + "-gram " + key);
} avgPrice = 0.0;
}
for (int i = 1; i <= 10; i++) { writer.write(decFmt.format(avgPrice/(double)count));
System.out writer.write("\n");
.println(i }
+ ". " try {
+ trigrams.get(i - 1).getKey() writer.close();
+ " : " } catch (IOException ioe) {
+ (((double) (trigrams.get(i - 1).getValue()) / totalNGramCounts[3]))); System.out.println(ioe.getMessage());
}
} }
if (genRandom) { if (genRandom) {
for (int nGramLength = 1; nGramLength <= MAX_N_GRAM_LENGTH; nGramLength++) { for (int nGramLength = 1; nGramLength <= MAX_N_GRAM_LENGTH; nGramLength++) {
@@ -333,11 +371,11 @@ public class NGramModel {
} }
public void reportModel(List<Headline> trainingSet, public void reportModel(List<Headline> trainingSet,
List<Headline> validationSet) { List<Headline> validationSet, Map<String,Map<Date,StockPrice>> stockTrends) {
try { try {
NGramModel ngm = new NGramModel(); NGramModel ngm = new NGramModel();
boolean doCalcPerplexity = true; boolean doCalcPerplexity = true;
ngm.generateModel(trainingSet, !doCalcPerplexity, doCalcPerplexity); ngm.generateModel(trainingSet, !doCalcPerplexity, doCalcPerplexity, stockTrends);
if (doCalcPerplexity) { if (doCalcPerplexity) {
for (int i = 1; i <= MAX_N_GRAM_LENGTH; i++) { for (int i = 1; i <= MAX_N_GRAM_LENGTH; i++) {
ngm.calcPerplexity(validationSet, i, true); ngm.calcPerplexity(validationSet, i, true);

View File

@@ -6,8 +6,11 @@ import java.util.List;
import net.woodyfolsom.cs6601.p3.domain.Headline; import net.woodyfolsom.cs6601.p3.domain.Headline;
public interface HeadlineService { public interface HeadlineService {
boolean assignRandomDatasets(int training, int test, int validation);
int getCount();
int getCount(int dataset);
int insertHeadline(Headline headline); int insertHeadline(Headline headline);
int[] insertHeadlines(List<Headline> headline); int[] insertHeadlines(List<Headline> headline);
List<Headline> getHeadlines(String stock, Date date); List<Headline> getHeadlines(String stock, Date date);
List<Headline> getHeadlines(String stock, Date startDate, Date endDate); List<Headline> getHeadlines(String stock, Date startDate, Date endDate, int dataset);
} }

View File

@@ -34,7 +34,22 @@ public class MySQLHeadlineServiceImpl implements HeadlineService {
} }
@Override @Override
public List<Headline> getHeadlines(String stock, Date startDate, Date endDate) { public List<Headline> getHeadlines(String stock, Date startDate, Date endDate, int dataset) {
return headlineDao.select(stock, startDate, endDate); return headlineDao.select(stock, startDate, endDate, dataset);
}
@Override
public boolean assignRandomDatasets(int training, int test, int validation) {
return headlineDao.assignRandomDatasets(training, test, validation);
}
@Override
public int getCount() {
return headlineDao.getCount();
}
@Override
public int getCount(int dataset) {
return headlineDao.getCount(dataset);
} }
} }

View File

@@ -89,7 +89,22 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
@Override @Override
public List<Headline> getHeadlines(String stock, Date startDate, public List<Headline> getHeadlines(String stock, Date startDate,
Date endDate) { Date endDate, int dataset) {
throw new UnsupportedOperationException("This implementation does not support getting headlines for a date range."); throw new UnsupportedOperationException("This implementation does not support getting headlines for a date range.");
} }
@Override
public boolean assignRandomDatasets(int training, int test, int validation) {
throw new UnsupportedOperationException("This implementation does not support this method.");
}
@Override
public int getCount() {
throw new UnsupportedOperationException("This implementation does not support this method");
}
@Override
public int getCount(int dataset) {
throw new UnsupportedOperationException("This implementation does not support this method");
}
} }

View File

@@ -1,21 +1,23 @@
package net.woodyfolsom.cs6601.p3.dao; package net.woodyfolsom.cs6601.p3.dao;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService; import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext; import org.springframework.context.support.FileSystemXmlApplicationContext;
public class MySQLHeadlineDaoImplTest { public class MySQLHeadlineDaoImplTest {
private static HeadlineService headlineSvc; private static HeadlineService headlineSvc;
@BeforeClass @BeforeClass
public static void setUp() { public static void setUp() {
ApplicationContext context=new ClassPathXmlApplicationContext(new String[]{"/AppContext.xml"}); ApplicationContext context=new FileSystemXmlApplicationContext(new String[]{"AppContext.xml"});
headlineSvc = (HeadlineService) context headlineSvc = (HeadlineService) context
.getBean("mySQLHeadlineSvc"); .getBean("mySQLHeadlineSvc");
} }
@@ -24,4 +26,29 @@ public class MySQLHeadlineDaoImplTest {
public void testSelect() { public void testSelect() {
assertNotNull(headlineSvc); assertNotNull(headlineSvc);
} }
//Change this back to @Test to run it... but beware, it shuffles the datasets. Best done n times for n-fold cross validation.
@Ignore
public void testAssignRandomDatasets() {
int trainingPct = 80;
int testPct = 10;
int valPct = 10;
//assignment fails if character is ommitted from valPct (80% 10% 1% by accident)
assertFalse(headlineSvc.assignRandomDatasets(trainingPct,testPct,valPct/10));
//assignment succeeds if requested ratio is 8-1-1
assertTrue(headlineSvc.assignRandomDatasets(trainingPct,testPct,valPct));
int allCount = headlineSvc.getCount();
int trainingCount = headlineSvc.getCount(1);
int testCount = headlineSvc.getCount(2);
int valCount = headlineSvc.getCount(3);
assertEquals(trainingCount + testCount + valCount, allCount);
assertEquals((double)trainingCount/allCount,(double)trainingPct / 100.0,0.01);
assertEquals((double)testCount/allCount,(double)testPct / 100.0,0.01);
assertEquals((double)valCount/allCount,(double)valPct / 100.0,0.01);
}
} }