Added script to pull historical stock data and resulting data files (1 per company). Added code to generate average price change per 1, 2 and 3-gram. Added code to output average price change per headline for VALIDATION dataset.
This commit is contained in:
6965
data/AAPL.txt
Normal file
6965
data/AAPL.txt
Normal file
File diff suppressed because it is too large
Load Diff
4286
data/ABC.txt
Normal file
4286
data/ABC.txt
Normal file
File diff suppressed because it is too large
Load Diff
7319
data/ADM.txt
Normal file
7319
data/ADM.txt
Normal file
File diff suppressed because it is too large
Load Diff
6964
data/AIG.txt
Normal file
6964
data/AIG.txt
Normal file
File diff suppressed because it is too large
Load Diff
12662
data/BA.txt
Normal file
12662
data/BA.txt
Normal file
File diff suppressed because it is too large
Load Diff
6530
data/BAC.txt
Normal file
6530
data/BAC.txt
Normal file
File diff suppressed because it is too large
Load Diff
6810
data/BBY.txt
Normal file
6810
data/BBY.txt
Normal file
File diff suppressed because it is too large
Load Diff
0
data/BRKA.txt
Normal file
0
data/BRKA.txt
Normal file
8906
data/C.txt
Normal file
8906
data/C.txt
Normal file
File diff suppressed because it is too large
Load Diff
6126
data/CAH.txt
Normal file
6126
data/CAH.txt
Normal file
File diff suppressed because it is too large
Load Diff
7643
data/COP.txt
Normal file
7643
data/COP.txt
Normal file
File diff suppressed because it is too large
Load Diff
6502
data/COST.txt
Normal file
6502
data/COST.txt
Normal file
File diff suppressed because it is too large
Load Diff
6888
data/CVS.txt
Normal file
6888
data/CVS.txt
Normal file
File diff suppressed because it is too large
Load Diff
10675
data/CVX.txt
Normal file
10675
data/CVX.txt
Normal file
File diff suppressed because it is too large
Load Diff
5968
data/DELL.txt
Normal file
5968
data/DELL.txt
Normal file
File diff suppressed because it is too large
Load Diff
8905
data/DOW.txt
Normal file
8905
data/DOW.txt
Normal file
File diff suppressed because it is too large
Load Diff
8906
data/F.txt
Normal file
8906
data/F.txt
Normal file
File diff suppressed because it is too large
Load Diff
5886
data/FMCC.txt
Normal file
5886
data/FMCC.txt
Normal file
File diff suppressed because it is too large
Load Diff
8899
data/FNMA.txt
Normal file
8899
data/FNMA.txt
Normal file
File diff suppressed because it is too large
Load Diff
12662
data/GE.txt
Normal file
12662
data/GE.txt
Normal file
File diff suppressed because it is too large
Load Diff
355
data/GM.txt
Normal file
355
data/GM.txt
Normal file
@@ -0,0 +1,355 @@
|
||||
Date,Open,High,Low,Close,Volume,Adj_Close
|
||||
2012-04-16,23.78,23.93,23.36,23.42,7528500,23.42
|
||||
2012-04-13,24.22,24.29,23.62,23.80,7310900,23.80
|
||||
2012-04-12,24.05,24.49,24.00,24.30,5041800,24.30
|
||||
2012-04-11,23.84,24.28,23.82,24.03,9260900,24.03
|
||||
2012-04-10,24.11,24.14,23.29,23.71,11856100,23.71
|
||||
2012-04-09,24.22,24.40,24.03,24.20,7242200,24.20
|
||||
2012-04-05,24.99,25.18,24.61,24.81,8211300,24.81
|
||||
2012-04-04,25.10,25.40,24.90,25.10,9785500,25.10
|
||||
2012-04-03,26.64,27.03,25.27,25.54,26410400,25.54
|
||||
2012-04-02,26.03,26.91,25.98,26.76,14465300,26.76
|
||||
2012-03-30,25.49,25.78,25.11,25.65,6035000,25.65
|
||||
2012-03-29,25.02,25.37,24.94,25.31,6433300,25.31
|
||||
2012-03-28,25.41,25.48,25.02,25.15,5162000,25.15
|
||||
2012-03-27,25.42,25.85,25.34,25.35,6632500,25.35
|
||||
2012-03-26,25.23,25.63,25.14,25.58,7957700,25.58
|
||||
2012-03-23,25.04,25.32,24.77,25.17,5802800,25.17
|
||||
2012-03-22,25.01,25.26,24.87,25.05,5605800,25.05
|
||||
2012-03-21,25.17,25.49,25.10,25.29,6122800,25.29
|
||||
2012-03-20,25.10,25.42,24.60,25.09,10819300,25.09
|
||||
2012-03-19,25.50,25.70,25.24,25.39,5462700,25.39
|
||||
2012-03-16,26.07,26.07,25.26,25.57,11752400,25.57
|
||||
2012-03-15,26.37,26.58,25.88,26.05,10133700,26.05
|
||||
2012-03-14,26.13,26.59,26.00,26.32,9082600,26.32
|
||||
2012-03-13,25.48,26.10,25.33,26.07,8338500,26.07
|
||||
2012-03-12,25.62,25.71,25.15,25.39,4006300,25.39
|
||||
2012-03-09,25.70,25.82,25.39,25.62,6323600,25.62
|
||||
2012-03-08,25.37,25.59,25.10,25.45,8646700,25.45
|
||||
2012-03-07,24.75,25.21,24.75,24.88,6856400,24.88
|
||||
2012-03-06,25.31,25.34,24.38,24.58,19071300,24.58
|
||||
2012-03-05,26.27,26.40,25.80,26.00,7624700,26.00
|
||||
2012-03-02,26.55,26.75,26.36,26.45,8442800,26.45
|
||||
2012-03-01,26.19,26.80,26.15,26.47,8893300,26.47
|
||||
2012-02-29,26.07,26.55,25.92,26.02,12018900,26.02
|
||||
2012-02-28,26.49,26.53,26.10,26.14,7547000,26.14
|
||||
2012-02-27,26.07,26.55,25.57,26.46,8600900,26.46
|
||||
2012-02-24,26.90,26.95,26.06,26.07,9404600,26.07
|
||||
2012-02-23,26.69,27.27,26.26,26.79,10300900,26.79
|
||||
2012-02-22,27.08,27.13,26.53,26.55,10693300,26.55
|
||||
2012-02-21,27.30,27.55,26.99,27.06,8593200,27.06
|
||||
2012-02-17,27.17,27.68,27.01,27.34,17603600,27.34
|
||||
2012-02-16,25.29,27.26,25.27,27.17,35313600,27.17
|
||||
2012-02-15,25.73,25.75,24.90,24.93,13292000,24.93
|
||||
2012-02-14,25.21,25.45,25.00,25.40,9885800,25.40
|
||||
2012-02-13,26.00,26.00,25.21,25.34,10503600,25.34
|
||||
2012-02-10,25.48,25.52,25.25,25.50,9819000,25.50
|
||||
2012-02-09,25.96,26.22,25.50,25.74,7440000,25.74
|
||||
2012-02-08,26.31,26.42,25.58,25.75,17392900,25.75
|
||||
2012-02-07,26.62,26.62,26.15,26.22,10764100,26.22
|
||||
2012-02-06,26.47,26.83,25.95,26.70,17264100,26.70
|
||||
2012-02-03,25.00,26.44,24.79,26.18,25510600,26.18
|
||||
2012-02-02,24.65,24.69,24.30,24.31,6670900,24.31
|
||||
2012-02-01,24.33,24.58,24.07,24.37,13337500,24.37
|
||||
2012-01-31,24.53,24.59,23.95,24.02,9190800,24.02
|
||||
2012-01-30,24.06,24.57,23.95,24.23,6319800,24.23
|
||||
2012-01-27,23.80,24.54,23.33,24.37,14784100,24.37
|
||||
2012-01-26,25.10,25.50,24.60,24.72,11585300,24.72
|
||||
2012-01-25,24.89,25.01,24.38,24.92,13441000,24.92
|
||||
2012-01-24,24.84,24.93,24.50,24.79,8350000,24.79
|
||||
2012-01-23,25.14,25.25,24.79,24.92,8923500,24.92
|
||||
2012-01-20,24.93,25.11,24.84,25.00,9840500,25.00
|
||||
2012-01-19,24.69,24.98,24.45,24.82,15222800,24.82
|
||||
2012-01-18,24.27,24.58,24.02,24.51,8919200,24.51
|
||||
2012-01-17,24.60,24.68,24.17,24.20,9385800,24.20
|
||||
2012-01-13,24.27,24.65,23.91,24.29,12963100,24.29
|
||||
2012-01-12,24.35,24.82,23.76,24.67,16750800,24.67
|
||||
2012-01-11,23.37,24.64,23.34,24.47,21892300,24.47
|
||||
2012-01-10,23.22,23.40,22.78,23.24,13538300,23.24
|
||||
2012-01-09,23.20,23.43,22.70,22.84,12084500,22.84
|
||||
2012-01-06,22.26,23.03,22.24,22.92,18234500,22.92
|
||||
2012-01-05,21.10,22.29,20.96,22.17,17880600,22.17
|
||||
2012-01-04,21.05,21.37,20.75,21.15,7856700,21.15
|
||||
2012-01-03,20.83,21.18,20.75,21.05,9321300,21.05
|
||||
2011-12-30,20.12,20.37,20.05,20.27,6971400,20.27
|
||||
2011-12-29,19.85,20.25,19.71,20.21,6891400,20.21
|
||||
2011-12-28,20.08,20.13,19.76,19.86,7116200,19.86
|
||||
2011-12-27,20.43,20.43,20.08,20.09,5866200,20.09
|
||||
2011-12-23,20.81,20.89,20.45,20.50,6480600,20.50
|
||||
2011-12-22,20.25,20.85,20.17,20.70,7287200,20.70
|
||||
2011-12-21,19.74,20.44,19.58,20.32,13054600,20.32
|
||||
2011-12-20,19.42,19.90,19.35,19.69,13952500,19.69
|
||||
2011-12-19,20.12,20.23,19.00,19.05,15608900,19.05
|
||||
2011-12-16,20.16,20.52,19.97,20.15,9109900,20.15
|
||||
2011-12-15,19.76,20.22,19.51,20.10,10152400,20.10
|
||||
2011-12-14,19.95,20.01,19.42,19.47,15053700,19.47
|
||||
2011-12-13,20.96,21.00,19.95,20.11,12647100,20.11
|
||||
2011-12-12,20.66,20.90,20.56,20.80,6281600,20.80
|
||||
2011-12-09,21.09,21.40,21.06,21.15,6491100,21.15
|
||||
2011-12-08,21.76,21.76,20.85,20.98,11508500,20.98
|
||||
2011-12-07,21.62,22.11,21.48,21.94,9194700,21.94
|
||||
2011-12-06,21.60,21.88,21.28,21.68,10668900,21.68
|
||||
2011-12-05,21.52,21.99,21.43,21.59,9808700,21.59
|
||||
2011-12-02,21.35,21.73,21.13,21.28,9781200,21.28
|
||||
2011-12-01,21.24,21.77,20.93,20.96,12128500,20.96
|
||||
2011-11-30,21.02,21.31,20.94,21.29,11147700,21.29
|
||||
2011-11-29,20.73,20.85,20.30,20.31,7665000,20.31
|
||||
2011-11-28,20.95,21.43,20.65,20.74,8959100,20.74
|
||||
2011-11-25,20.22,20.71,20.21,20.34,2757600,20.34
|
||||
2011-11-23,20.48,20.69,20.06,20.24,9594500,20.24
|
||||
2011-11-22,20.92,21.12,20.70,20.73,7288600,20.73
|
||||
2011-11-21,21.10,21.18,20.54,21.05,11757900,21.05
|
||||
2011-11-18,22.01,22.18,21.62,21.68,6592500,21.68
|
||||
2011-11-17,22.72,22.75,21.56,21.79,12692700,21.79
|
||||
2011-11-16,23.24,23.35,22.62,22.65,9297200,22.65
|
||||
2011-11-15,22.81,23.53,22.60,23.35,13867200,23.35
|
||||
2011-11-14,22.56,23.29,22.51,22.99,13433300,22.99
|
||||
2011-11-11,22.95,23.10,22.22,22.51,15438300,22.51
|
||||
2011-11-10,22.44,22.85,21.93,22.70,15866600,22.70
|
||||
2011-11-09,23.07,23.57,22.15,22.31,32911600,22.31
|
||||
2011-11-08,24.20,25.17,23.98,25.04,19891800,25.04
|
||||
2011-11-07,23.93,24.15,23.52,24.01,9802800,24.01
|
||||
2011-11-04,23.89,23.92,23.31,23.61,9452000,23.61
|
||||
2011-11-03,23.82,24.12,22.76,24.03,19953000,24.03
|
||||
2011-11-02,23.70,23.73,22.92,23.20,14368700,23.20
|
||||
2011-11-01,24.82,24.90,23.25,23.33,25365200,23.33
|
||||
2011-10-31,25.92,26.16,25.61,25.85,8853600,25.85
|
||||
2011-10-28,26.22,26.55,26.00,26.45,10718700,26.45
|
||||
2011-10-27,25.87,26.47,25.20,26.32,18428600,26.32
|
||||
2011-10-26,25.02,25.28,24.69,24.99,17497500,24.99
|
||||
2011-10-25,24.86,25.19,24.16,24.86,11384500,24.86
|
||||
2011-10-24,24.28,25.24,24.25,24.98,13534100,24.98
|
||||
2011-10-21,23.40,24.38,23.20,24.35,12352500,24.35
|
||||
2011-10-20,23.02,23.18,22.51,22.96,9488400,22.96
|
||||
2011-10-19,23.52,23.55,22.96,23.09,7600700,23.09
|
||||
2011-10-18,23.19,23.87,22.77,23.54,14135700,23.54
|
||||
2011-10-17,24.17,24.19,23.15,23.18,8633400,23.18
|
||||
2011-10-14,23.68,24.16,23.36,24.16,14753200,24.16
|
||||
2011-10-13,23.38,23.38,22.50,23.15,9858600,23.15
|
||||
2011-10-12,22.94,23.97,22.86,23.41,17303900,23.41
|
||||
2011-10-11,22.43,22.69,22.28,22.50,10258300,22.50
|
||||
2011-10-10,22.51,22.97,22.36,22.62,11291700,22.62
|
||||
2011-10-07,22.58,22.93,21.82,22.01,14476800,22.01
|
||||
2011-10-06,22.30,22.60,21.75,22.35,13553000,22.35
|
||||
2011-10-05,21.33,22.29,20.76,22.27,17155800,22.27
|
||||
2011-10-04,19.45,21.46,19.05,21.42,23800000,21.42
|
||||
2011-10-03,20.13,20.90,19.65,19.73,13950700,19.73
|
||||
2011-09-30,20.44,20.50,20.10,20.18,10151100,20.18
|
||||
2011-09-29,20.72,20.97,20.12,20.76,10781000,20.76
|
||||
2011-09-28,21.20,21.44,20.37,20.41,10120900,20.41
|
||||
2011-09-27,21.60,21.83,21.08,21.19,11170900,21.19
|
||||
2011-09-26,21.32,21.44,20.53,21.08,8503500,21.08
|
||||
2011-09-23,19.77,21.28,19.77,21.00,13722000,21.00
|
||||
2011-09-22,20.59,20.99,20.04,20.24,17284200,20.24
|
||||
2011-09-21,22.39,22.70,21.22,21.28,9850700,21.28
|
||||
2011-09-20,23.05,23.10,22.42,22.43,9507100,22.43
|
||||
2011-09-19,22.15,23.17,22.05,23.05,14082400,23.05
|
||||
2011-09-16,22.68,22.77,22.34,22.61,7932400,22.61
|
||||
2011-09-15,22.59,22.79,22.15,22.70,8039200,22.70
|
||||
2011-09-14,22.22,22.49,21.62,22.18,11623100,22.18
|
||||
2011-09-13,21.87,22.33,21.50,22.00,11189600,22.00
|
||||
2011-09-12,21.15,21.95,21.00,21.87,9325700,21.87
|
||||
2011-09-09,22.36,22.45,21.47,21.76,11920600,21.76
|
||||
2011-09-08,22.79,23.13,22.24,22.48,11782500,22.48
|
||||
2011-09-07,21.82,23.04,21.82,22.86,13412200,22.86
|
||||
2011-09-06,21.36,21.58,20.88,21.44,14282500,21.44
|
||||
2011-09-02,22.41,22.55,21.73,22.07,14086700,22.07
|
||||
2011-09-01,24.09,24.25,22.91,23.03,16926200,23.03
|
||||
2011-08-31,23.87,24.49,23.83,24.03,12329600,24.03
|
||||
2011-08-30,23.46,23.74,23.05,23.58,8142900,23.58
|
||||
2011-08-29,23.30,23.96,23.27,23.79,10642100,23.79
|
||||
2011-08-26,22.20,23.08,21.80,22.87,10278000,22.87
|
||||
2011-08-25,22.49,22.74,22.10,22.30,11099900,22.30
|
||||
2011-08-24,21.89,22.39,21.57,22.37,14527400,22.37
|
||||
2011-08-23,21.78,22.06,21.21,22.06,20007100,22.06
|
||||
2011-08-22,22.54,22.72,21.18,21.71,27158700,21.71
|
||||
2011-08-19,23.25,23.44,21.71,22.16,34432900,22.16
|
||||
2011-08-18,23.99,24.08,23.27,23.60,15853700,23.60
|
||||
2011-08-17,26.00,26.09,24.90,24.94,12062000,24.94
|
||||
2011-08-16,26.22,26.69,25.69,25.83,10145500,25.83
|
||||
2011-08-15,26.03,26.54,25.79,26.42,12994800,26.42
|
||||
2011-08-12,26.30,26.50,25.49,25.75,18861500,25.75
|
||||
2011-08-11,24.11,26.17,24.11,25.81,25179600,25.81
|
||||
2011-08-10,24.66,25.00,23.83,23.92,20642400,23.92
|
||||
2011-08-09,25.09,25.56,24.00,25.54,26704600,25.54
|
||||
2011-08-08,24.61,25.35,23.79,24.57,32608700,24.57
|
||||
2011-08-05,26.07,26.46,24.49,26.31,34926100,26.31
|
||||
2011-08-04,27.16,27.20,25.71,25.99,38839500,25.99
|
||||
2011-08-03,27.03,27.17,26.13,27.17,15790400,27.17
|
||||
2011-08-02,27.75,28.09,27.02,27.05,18222000,27.05
|
||||
2011-08-01,28.88,28.88,27.70,28.07,12825600,28.07
|
||||
2011-07-29,27.62,28.10,27.31,27.68,12182900,27.68
|
||||
2011-07-28,28.22,28.90,28.02,28.10,11029000,28.10
|
||||
2011-07-27,28.90,29.00,28.03,28.14,14552900,28.14
|
||||
2011-07-26,29.67,29.70,28.96,29.09,9268400,29.09
|
||||
2011-07-25,29.53,29.84,29.39,29.50,8690700,29.50
|
||||
2011-07-22,29.97,30.29,29.88,30.10,7820100,30.10
|
||||
2011-07-21,29.39,30.02,29.16,29.96,12231600,29.96
|
||||
2011-07-20,29.45,29.48,29.05,29.24,6997900,29.24
|
||||
2011-07-19,29.20,29.49,28.76,29.33,10283400,29.33
|
||||
2011-07-18,29.57,29.65,28.62,29.10,13850000,29.10
|
||||
2011-07-15,30.39,30.62,29.52,29.76,10400000,29.76
|
||||
2011-07-14,30.85,30.97,30.03,30.10,10800200,30.10
|
||||
2011-07-13,30.87,31.30,30.61,30.75,8093200,30.75
|
||||
2011-07-12,30.24,30.94,30.02,30.68,10536600,30.68
|
||||
2011-07-11,30.98,31.20,30.55,30.75,8376600,30.75
|
||||
2011-07-08,31.25,31.70,31.16,31.58,9872100,31.58
|
||||
2011-07-07,31.79,32.08,31.65,31.80,13339400,31.80
|
||||
2011-07-06,31.37,31.68,31.09,31.19,14223700,31.19
|
||||
2011-07-05,30.87,31.36,30.58,30.86,8836900,30.86
|
||||
2011-07-01,30.35,30.86,29.92,30.58,18098400,30.58
|
||||
2011-06-30,30.30,30.56,30.22,30.36,18656100,30.36
|
||||
2011-06-29,30.79,30.79,30.25,30.30,12716100,30.30
|
||||
2011-06-28,30.21,30.79,30.18,30.50,12698300,30.50
|
||||
2011-06-27,29.79,30.46,29.60,30.26,15348000,30.26
|
||||
2011-06-24,30.15,30.30,29.66,29.92,50062300,29.92
|
||||
2011-06-23,29.53,30.20,29.32,30.14,13780400,30.14
|
||||
2011-06-22,29.62,30.18,29.50,29.97,15520300,29.97
|
||||
2011-06-21,29.51,30.00,29.43,29.59,12648300,29.59
|
||||
2011-06-20,28.87,29.60,28.77,29.52,9038000,29.52
|
||||
2011-06-17,28.73,29.06,28.57,29.00,16732300,29.00
|
||||
2011-06-16,28.70,28.99,28.17,28.59,14079400,28.59
|
||||
2011-06-15,28.77,29.11,28.64,28.95,11669900,28.95
|
||||
2011-06-14,28.92,29.49,28.86,29.11,10948400,29.11
|
||||
2011-06-13,28.90,29.08,28.29,28.59,9791900,28.59
|
||||
2011-06-10,29.30,29.30,28.65,28.85,11734000,28.85
|
||||
2011-06-09,29.20,29.58,28.91,29.45,13593600,29.45
|
||||
2011-06-08,28.52,29.34,28.40,28.86,16534000,28.86
|
||||
2011-06-07,28.89,29.04,28.39,28.78,15237900,28.78
|
||||
2011-06-06,29.02,29.41,28.55,28.56,13416100,28.56
|
||||
2011-06-03,29.28,29.56,28.90,29.12,21968400,29.12
|
||||
2011-06-02,30.33,30.56,29.40,29.60,22440900,29.60
|
||||
2011-06-01,31.70,31.70,30.15,30.23,19134400,30.23
|
||||
2011-05-31,31.44,31.87,31.13,31.81,22938900,31.81
|
||||
2011-05-27,30.77,31.48,30.59,31.28,9114200,31.28
|
||||
2011-05-26,31.08,31.10,30.52,30.68,12244400,30.68
|
||||
2011-05-25,30.56,31.38,30.50,31.27,14390800,31.27
|
||||
2011-05-24,31.13,31.20,30.50,30.83,8779100,30.83
|
||||
2011-05-23,30.68,31.16,30.50,30.96,8971700,30.96
|
||||
2011-05-20,31.36,31.50,31.10,31.18,7224100,31.18
|
||||
2011-05-19,31.53,31.79,31.31,31.47,9575300,31.47
|
||||
2011-05-18,31.07,31.62,31.00,31.52,9949400,31.52
|
||||
2011-05-17,31.06,31.38,30.83,31.10,11191200,31.10
|
||||
2011-05-16,31.25,31.50,31.08,31.10,7291700,31.10
|
||||
2011-05-13,31.46,31.54,30.85,31.07,10837200,31.07
|
||||
2011-05-12,31.07,31.60,30.93,31.42,14128200,31.42
|
||||
2011-05-11,31.57,31.86,31.11,31.30,9073300,31.30
|
||||
2011-05-10,31.47,31.64,31.33,31.61,7841900,31.61
|
||||
2011-05-09,31.74,32.06,31.36,31.39,10716400,31.39
|
||||
2011-05-06,32.50,32.60,31.84,31.91,12801800,31.91
|
||||
2011-05-05,32.06,32.68,31.49,32.02,26623400,32.02
|
||||
2011-05-04,33.16,33.47,32.71,33.04,20492000,33.04
|
||||
2011-05-03,32.38,33.20,32.36,32.99,29894800,32.99
|
||||
2011-05-02,32.41,32.50,31.92,32.18,11014500,32.18
|
||||
2011-04-29,31.99,32.58,31.91,32.09,13774600,32.09
|
||||
2011-04-28,31.76,32.10,31.48,31.91,15810100,31.91
|
||||
2011-04-27,31.47,31.79,31.28,31.78,14945600,31.78
|
||||
2011-04-26,31.39,31.51,30.96,31.27,15700000,31.27
|
||||
2011-04-25,31.00,31.19,30.32,31.14,15442500,31.14
|
||||
2011-04-21,30.05,31.00,30.01,30.95,18920800,30.95
|
||||
2011-04-20,29.76,30.38,29.42,29.93,22038100,29.93
|
||||
2011-04-19,29.81,29.91,29.17,29.59,19914800,29.59
|
||||
2011-04-18,30.06,30.34,29.90,29.97,12745600,29.97
|
||||
2011-04-15,30.59,30.72,30.18,30.24,9882800,30.24
|
||||
2011-04-14,30.65,30.86,30.35,30.58,9044800,30.58
|
||||
2011-04-13,31.23,31.32,30.59,30.86,13781100,30.86
|
||||
2011-04-12,30.40,31.34,30.10,31.15,19648600,31.15
|
||||
2011-04-11,31.34,31.45,30.55,30.77,15179100,30.77
|
||||
2011-04-08,32.40,32.75,31.33,31.52,16057700,31.52
|
||||
2011-04-07,32.84,32.84,32.07,32.31,11242400,32.31
|
||||
2011-04-06,33.00,33.28,32.52,32.87,8160100,32.87
|
||||
2011-04-05,32.32,32.87,32.10,32.87,10106900,32.87
|
||||
2011-04-04,32.50,32.72,32.20,32.39,12331400,32.39
|
||||
2011-04-01,31.39,32.63,30.84,32.41,29883100,32.41
|
||||
2011-03-31,31.40,31.55,31.00,31.03,8976600,31.03
|
||||
2011-03-30,31.16,31.64,31.04,31.55,7657000,31.55
|
||||
2011-03-29,30.93,31.17,30.68,31.10,9756700,31.10
|
||||
2011-03-28,31.58,31.58,30.85,30.85,10302600,30.85
|
||||
2011-03-25,31.49,31.70,31.09,31.47,15242300,31.47
|
||||
2011-03-24,31.32,31.60,31.24,31.39,15743100,31.39
|
||||
2011-03-23,30.60,31.28,30.20,31.16,23016000,31.16
|
||||
2011-03-22,31.28,31.35,30.51,30.74,16982100,30.74
|
||||
2011-03-21,32.24,32.30,31.23,31.28,13196500,31.28
|
||||
2011-03-18,31.74,31.95,31.42,31.85,9861200,31.85
|
||||
2011-03-17,32.18,32.39,31.33,31.44,11025600,31.44
|
||||
2011-03-16,32.42,32.53,31.40,31.78,14148900,31.78
|
||||
2011-03-15,30.98,32.49,30.65,32.35,21471300,32.35
|
||||
2011-03-14,32.14,32.30,31.43,31.59,9976500,31.59
|
||||
2011-03-11,31.25,32.06,31.24,31.93,14024000,31.93
|
||||
2011-03-10,31.47,31.78,30.95,31.42,38333600,31.42
|
||||
2011-03-09,32.74,32.76,32.10,32.25,11225600,32.25
|
||||
2011-03-08,31.74,32.82,31.69,32.72,14509400,32.72
|
||||
2011-03-07,32.35,32.50,31.52,31.70,15077100,31.70
|
||||
2011-03-04,33.07,33.08,32.01,32.39,24240900,32.39
|
||||
2011-03-03,33.03,33.17,32.65,33.03,17614800,33.03
|
||||
2011-03-02,32.90,33.17,32.59,32.88,14306700,32.88
|
||||
2011-03-01,33.69,33.75,32.43,32.95,27321600,32.95
|
||||
2011-02-28,33.49,33.74,32.86,33.53,15886400,33.53
|
||||
2011-02-25,33.67,34.20,33.05,33.25,29017800,33.25
|
||||
2011-02-24,34.90,35.00,32.05,33.02,63562800,33.02
|
||||
2011-02-23,35.85,35.94,33.80,34.59,28192500,34.59
|
||||
2011-02-22,35.86,36.15,35.45,35.77,13937400,35.77
|
||||
2011-02-18,36.42,36.76,36.38,36.51,6815800,36.51
|
||||
2011-02-17,36.55,36.70,36.30,36.37,7463000,36.37
|
||||
2011-02-16,36.11,36.84,36.02,36.75,8680900,36.75
|
||||
2011-02-15,36.19,36.41,35.80,36.11,10336500,36.11
|
||||
2011-02-14,36.55,36.56,35.47,36.29,7121200,36.29
|
||||
2011-02-11,35.76,36.57,35.55,36.45,13508300,36.45
|
||||
2011-02-10,36.17,36.64,35.52,35.88,11542400,35.88
|
||||
2011-02-09,36.82,36.91,36.28,36.41,6399300,36.41
|
||||
2011-02-08,36.97,37.05,36.40,36.89,6243000,36.89
|
||||
2011-02-07,36.95,37.09,36.61,36.70,7328800,36.70
|
||||
2011-02-04,36.25,36.73,35.89,36.59,11115000,36.59
|
||||
2011-02-03,35.97,36.06,35.13,36.06,19350000,36.06
|
||||
2011-02-02,36.46,36.56,35.58,35.68,17434700,35.68
|
||||
2011-02-01,36.93,37.23,36.13,36.45,28091600,36.45
|
||||
2011-01-31,36.89,37.05,35.89,36.49,13954800,36.49
|
||||
2011-01-28,38.00,38.02,36.01,36.60,37134600,36.60
|
||||
2011-01-27,38.20,38.95,38.03,38.67,13950000,38.67
|
||||
2011-01-26,38.75,38.91,37.86,37.89,11534600,37.89
|
||||
2011-01-25,37.85,38.49,37.71,38.40,12318700,38.40
|
||||
2011-01-24,37.71,37.86,37.03,37.64,12202800,37.64
|
||||
2011-01-21,37.33,37.85,36.82,37.24,9548500,37.24
|
||||
2011-01-20,37.11,37.29,36.27,37.18,15844900,37.18
|
||||
2011-01-19,37.78,37.93,37.26,37.40,12667700,37.40
|
||||
2011-01-18,38.05,38.33,37.32,38.03,8918200,38.03
|
||||
2011-01-14,38.18,38.47,38.04,38.20,5894200,38.20
|
||||
2011-01-13,38.66,38.71,38.11,38.27,11358300,38.27
|
||||
2011-01-12,38.95,39.37,38.37,38.62,16773900,38.62
|
||||
2011-01-11,38.66,39.43,38.51,38.75,14856500,38.75
|
||||
2011-01-10,39.34,39.36,38.44,38.56,18341600,38.56
|
||||
2011-01-07,38.84,39.33,38.51,38.98,19901100,38.98
|
||||
2011-01-06,38.24,39.48,38.07,38.90,38556900,38.90
|
||||
2011-01-05,37.47,38.30,37.47,38.07,22503900,38.07
|
||||
2011-01-04,37.10,37.99,36.68,37.90,32363400,37.90
|
||||
2011-01-03,37.32,38.00,37.03,37.06,24874900,37.06
|
||||
2010-12-31,36.84,36.96,36.57,36.86,6163900,36.86
|
||||
2010-12-30,36.10,36.98,36.02,36.82,16980800,36.82
|
||||
2010-12-29,35.47,36.30,35.25,36.02,20960800,36.02
|
||||
2010-12-28,35.38,35.67,35.07,35.32,23489000,35.32
|
||||
2010-12-27,34.41,34.89,34.19,34.60,7368300,34.60
|
||||
2010-12-23,34.67,35.52,34.62,34.81,20529200,34.81
|
||||
2010-12-22,33.72,34.95,33.53,34.92,20935100,34.92
|
||||
2010-12-21,33.86,33.94,33.72,33.85,9012400,33.85
|
||||
2010-12-20,33.91,34.05,33.74,33.76,12476400,33.76
|
||||
2010-12-17,33.53,34.00,33.19,34.00,35681600,34.00
|
||||
2010-12-16,33.57,33.86,33.56,33.61,9885700,33.61
|
||||
2010-12-15,33.81,34.01,33.61,33.61,10182700,33.61
|
||||
2010-12-14,33.73,33.92,33.45,33.89,15165600,33.89
|
||||
2010-12-13,33.96,34.05,33.70,33.80,11029300,33.80
|
||||
2010-12-10,33.85,33.99,33.53,33.81,11741700,33.81
|
||||
2010-12-09,34.36,34.43,33.62,33.74,18402200,33.74
|
||||
2010-12-08,34.61,34.73,34.33,34.45,12603900,34.45
|
||||
2010-12-07,34.75,34.89,34.46,34.68,20823000,34.68
|
||||
2010-12-06,34.48,34.78,34.41,34.48,11676500,34.48
|
||||
2010-12-03,34.55,34.60,33.97,34.55,19395200,34.55
|
||||
2010-12-02,34.92,34.98,34.51,34.68,23196100,34.68
|
||||
2010-12-01,34.65,34.95,34.42,34.78,34633200,34.78
|
||||
2010-11-30,33.53,34.25,33.36,34.20,57476900,34.20
|
||||
2010-11-29,33.80,33.81,33.07,33.80,27776900,33.80
|
||||
2010-11-26,33.41,33.81,33.21,33.80,12301200,33.80
|
||||
2010-11-24,33.73,33.80,33.22,33.48,26138000,33.48
|
||||
2010-11-23,33.95,33.99,33.19,33.25,31170200,33.25
|
||||
2010-11-22,34.20,34.48,33.81,34.08,36650600,34.08
|
||||
2010-11-19,34.15,34.50,33.11,34.26,107842000,34.26
|
||||
2010-11-18,35.00,35.99,33.89,34.19,457044300,34.19
|
||||
6977
data/HD.txt
Normal file
6977
data/HD.txt
Normal file
File diff suppressed because it is too large
Load Diff
12662
data/HPQ.txt
Normal file
12662
data/HPQ.txt
Normal file
File diff suppressed because it is too large
Load Diff
12662
data/IBM.txt
Normal file
12662
data/IBM.txt
Normal file
File diff suppressed because it is too large
Load Diff
10675
data/JNJ.txt
Normal file
10675
data/JNJ.txt
Normal file
File diff suppressed because it is too large
Load Diff
7138
data/JPM.txt
Normal file
7138
data/JPM.txt
Normal file
File diff suppressed because it is too large
Load Diff
2726
data/KFT.txt
Normal file
2726
data/KFT.txt
Normal file
File diff suppressed because it is too large
Load Diff
8906
data/KR.txt
Normal file
8906
data/KR.txt
Normal file
File diff suppressed because it is too large
Load Diff
6759
data/LOW.txt
Normal file
6759
data/LOW.txt
Normal file
File diff suppressed because it is too large
Load Diff
4383
data/MCK.txt
Normal file
4383
data/MCK.txt
Normal file
File diff suppressed because it is too large
Load Diff
2969
data/MET.txt
Normal file
2969
data/MET.txt
Normal file
File diff suppressed because it is too large
Load Diff
2181
data/MHS.txt
Normal file
2181
data/MHS.txt
Normal file
File diff suppressed because it is too large
Load Diff
10669
data/MRO.txt
Normal file
10669
data/MRO.txt
Normal file
File diff suppressed because it is too large
Load Diff
6583
data/MSFT.txt
Normal file
6583
data/MSFT.txt
Normal file
File diff suppressed because it is too large
Load Diff
8905
data/PEP.txt
Normal file
8905
data/PEP.txt
Normal file
File diff suppressed because it is too large
Load Diff
7643
data/PFE.txt
Normal file
7643
data/PFE.txt
Normal file
File diff suppressed because it is too large
Load Diff
10675
data/PG.txt
Normal file
10675
data/PG.txt
Normal file
File diff suppressed because it is too large
Load Diff
1503
data/SNPAX.txt
Normal file
1503
data/SNPAX.txt
Normal file
File diff suppressed because it is too large
Load Diff
6999
data/T.txt
Normal file
6999
data/T.txt
Normal file
File diff suppressed because it is too large
Load Diff
7325
data/TGT.txt
Normal file
7325
data/TGT.txt
Normal file
File diff suppressed because it is too large
Load Diff
5562
data/UNH.txt
Normal file
5562
data/UNH.txt
Normal file
File diff suppressed because it is too large
Load Diff
3122
data/UPS.txt
Normal file
3122
data/UPS.txt
Normal file
File diff suppressed because it is too large
Load Diff
10675
data/UTX.txt
Normal file
10675
data/UTX.txt
Normal file
File diff suppressed because it is too large
Load Diff
7636
data/VLO.txt
Normal file
7636
data/VLO.txt
Normal file
File diff suppressed because it is too large
Load Diff
7165
data/VZ.txt
Normal file
7165
data/VZ.txt
Normal file
File diff suppressed because it is too large
Load Diff
6759
data/WAG.txt
Normal file
6759
data/WAG.txt
Normal file
File diff suppressed because it is too large
Load Diff
6925
data/WFC.txt
Normal file
6925
data/WFC.txt
Normal file
File diff suppressed because it is too large
Load Diff
2635
data/WLP.txt
Normal file
2635
data/WLP.txt
Normal file
File diff suppressed because it is too large
Load Diff
10003
data/WMT.txt
Normal file
10003
data/WMT.txt
Normal file
File diff suppressed because it is too large
Load Diff
10675
data/XOM.txt
Normal file
10675
data/XOM.txt
Normal file
File diff suppressed because it is too large
Load Diff
68
data/getStockHistory
Normal file
68
data/getStockHistory
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/bin/bash
|
||||
|
||||
# NAME
|
||||
# getStockHistory -- Downloads historic stock prices for a given company stock.
|
||||
#
|
||||
# SYNOPSIS
|
||||
# getStockHistory symbol [optional saveLocation]
|
||||
# getStockHistory -f file [optional saveLocation]
|
||||
#
|
||||
# DESCRIPTION
|
||||
# The getStockHistory command attempts to download complete stock history for a given company or
|
||||
# for list of companies in a file.
|
||||
#
|
||||
# OPTIONS
|
||||
# -f Use a file to have a list of stock symbols to download. Each stock symbol should be on a
|
||||
#    different line in the file.
|
||||
#
|
||||
#
|
||||
# EXAMPLES
|
||||
# The following downloads Apple's stock history.
|
||||
# getStockHistory AAPL
|
||||
#
|
||||
# BUGS
|
||||
#
|
||||
# Written by: Rowland O'Flaherty (rowland.oflaherty@gmail.com)
|
||||
# Created on: 02/08/10
|
||||
|
||||
function getHistory() {
|
||||
stockSymbol=$1
|
||||
if [[ -z $stockSymbol ]]
|
||||
then
|
||||
echo "Invalid stock symbol"
|
||||
exit 1
|
||||
fi
|
||||
stockSymbol=$(echo $stockSymbol | tr '[:lower:]' '[:upper:]')
|
||||
saveLocation=$2
|
||||
wget -q -O ${stockSymbol}.csv "http://ichart.finance.yahoo.com/table.csv?s=${stockSymbol}&ignore=.csv"
|
||||
sed '1s/\ /_/g' ${stockSymbol}.csv > ${stockSymbol}.tmp
|
||||
mv ${stockSymbol}.tmp ${stockSymbol}.csv
|
||||
mv ${stockSymbol}.csv ${saveLocation}
|
||||
}
|
||||
|
||||
fOpition=false
|
||||
while getopts :f OPTION
|
||||
do
|
||||
case $OPTION in
|
||||
f)
|
||||
fOpition=true
|
||||
;;
|
||||
'?')
|
||||
echo "Invalid option $OPTARG"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
if [[ "$fOpition" = "true" ]]
|
||||
then
|
||||
fileLocation=$2
|
||||
saveLocation=$3
|
||||
saveLocation=${saveLocation:=$HOME/Downloads}
|
||||
for aSym in $(cat $fileLocation)
|
||||
do
|
||||
eval "getHistory $aSym $saveLocation"
|
||||
done
|
||||
else
|
||||
saveLocation=$2
|
||||
saveLocation=${saveLocation:=$HOME/Downloads}
|
||||
eval "getHistory $1 $saveLocation"
|
||||
fi
|
||||
50
data/stock_symbols.csv
Normal file
50
data/stock_symbols.csv
Normal file
@@ -0,0 +1,50 @@
|
||||
1,Wal-Mart Stores,WMT
|
||||
2,Exxon Mobil,XOM
|
||||
3,Chevron,CVX
|
||||
4,ConocoPhillips,COP
|
||||
5,Fannie Mae,FNMA
|
||||
6,General Electric,GE
|
||||
7,Berkshire Hathaway,BRKA
|
||||
8,General Motors,GM
|
||||
9,Bank of America Corp.,BAC
|
||||
10,Ford Motor,F
|
||||
11,Hewlett-Packard,HPQ
|
||||
12,AT&T,T
|
||||
13,J.P. Morgan Chase & Co.,JPM
|
||||
14,Citigroup,C
|
||||
15,McKesson,MCK
|
||||
16,Verizon Communications,VZ
|
||||
17,American International Group,AIG
|
||||
18,International Business Machines,IBM
|
||||
19,Cardinal Health,CAH
|
||||
20,Freddie Mac,FMCC
|
||||
21,CVS Caremark,CVS
|
||||
22,UnitedHealth Group,UNH
|
||||
23,Wells Fargo,WFC
|
||||
24,Valero Energy,VLO
|
||||
25,Kroger,KR
|
||||
26,Procter & Gamble,PG
|
||||
27,AmerisourceBergen,ABC
|
||||
28,Costco Wholesale,COST
|
||||
29,Marathon Oil,MRO
|
||||
30,Home Depot,HD
|
||||
31,Pfizer,PFE
|
||||
32,Walgreen,WAG
|
||||
33,Target,TGT
|
||||
34,Medco Health Solutions,MHS
|
||||
35,Apple,AAPL
|
||||
36,Boeing,BA
|
||||
37,State Farm Insurance Cos.,SNPAX
|
||||
38,Microsoft,MSFT
|
||||
39,Archer Daniels Midland,ADM
|
||||
40,Johnson & Johnson,JNJ
|
||||
41,Dell,DELL
|
||||
42,WellPoint,WLP
|
||||
43,PepsiCo,PEP
|
||||
44,United Technologies,UTX
|
||||
45,Dow Chemical,DOW
|
||||
46,MetLife,MET
|
||||
47,Best Buy,BBY
|
||||
48,United Parcel Service,UPS
|
||||
49,Kraft Foods,KFT
|
||||
50,Lowe's,LOW
|
||||
|
@@ -35,10 +35,10 @@ public class HeadlinePuller {
|
||||
private static final int NO_ARGS = 5;
|
||||
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 6;
|
||||
|
||||
@Autowired
|
||||
HeadlineService mySQLHeadlineServiceImpl;
|
||||
@Autowired
|
||||
HeadlineService yahooHeadlineServiceImpl;
|
||||
//@Autowired
|
||||
//HeadlineService mySQLHeadlineServiceImpl;
|
||||
//@Autowired
|
||||
//HeadlineService yahooHeadlineServiceImpl;
|
||||
|
||||
private static void printUsage() {
|
||||
System.out
|
||||
@@ -115,10 +115,10 @@ public class HeadlinePuller {
|
||||
for (calendar.setTime(startDate); (today = calendar.getTime())
|
||||
.compareTo(endDate) <= 0; calendar
|
||||
.add(Calendar.DATE, 1)) {
|
||||
List<Headline> headlines = headlinePuller.pullHeadlines(
|
||||
company.getStockSymbol(), today);
|
||||
int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines);
|
||||
System.out.println(updates.length + " rows updated");
|
||||
//List<Headline> headlines = headlinePuller.pullHeadlines(
|
||||
// company.getStockSymbol(), today);
|
||||
//int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines);
|
||||
//System.out.println(updates.length + " rows updated");
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
@@ -132,12 +132,12 @@ public class HeadlinePuller {
|
||||
}
|
||||
}
|
||||
|
||||
private List<Headline> pullHeadlines(String stockSymbol, Date date) {
|
||||
List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(
|
||||
stockSymbol, date);
|
||||
System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date);
|
||||
return headlines;
|
||||
}
|
||||
//private List<Headline> pullHeadlines(String stockSymbol, Date date) {
|
||||
//List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(
|
||||
// stockSymbol, date);
|
||||
//System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date);
|
||||
//return headlines;
|
||||
//}
|
||||
|
||||
private List<Company> getFortune50(File csvFile)
|
||||
throws FileNotFoundException, IOException {
|
||||
|
||||
@@ -45,39 +45,30 @@ public class ModelGenerator {
|
||||
|
||||
Date startDate = null;
|
||||
Date endDate = null;
|
||||
Date valStart = null;
|
||||
Date valEnd = null;
|
||||
try {
|
||||
startDate = dateFmt.parse("2012-01-01");
|
||||
endDate = dateFmt.parse("2012-03-31");
|
||||
valStart = dateFmt.parse("2012-04-01");
|
||||
valEnd = dateFmt.parse("2012-04-14");
|
||||
endDate = dateFmt.parse("2012-04-14");
|
||||
} catch (ParseException pe) {
|
||||
System.exit(INVALID_DATE);
|
||||
}
|
||||
|
||||
List<Headline> trainingSet = new ArrayList<Headline>();
|
||||
//actually, this is the TEST dataset
|
||||
List<Headline> testSet = new ArrayList<Headline>();
|
||||
|
||||
try {
|
||||
List<Company> fortune50 = modelGenerator
|
||||
.getFortune50(stockSymbolsCSV);
|
||||
for (Company company : fortune50) {
|
||||
System.out.println("Getting headlines for Fortune 50 company #"
|
||||
+ company.getId() + " (" + company.getName() + ")...");
|
||||
List<Headline> trainingSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate);
|
||||
System.out.println("Pulled " + trainingSet.size() + " headlines for "
|
||||
List<Headline> coTrainingSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 1);
|
||||
System.out.println("Pulled " + coTrainingSet.size() + " headlines for "
|
||||
+ company.getStockSymbol() + " from " + startDate + " to " + endDate);
|
||||
List<Headline> validationSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), valStart, valEnd);
|
||||
List<Headline> coTestSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 2);
|
||||
|
||||
if (trainingSet.size() == 0) {
|
||||
System.out.println("Training dataset contains 0 headlines for " + company.getName() + ", skipping model generation.");
|
||||
continue;
|
||||
}
|
||||
if (validationSet.size() == 0) {
|
||||
System.out.println("Validation dataset contains 0 headlines for " + company.getName() + ", skipping model generation.");
|
||||
continue;
|
||||
}
|
||||
|
||||
modelGenerator.ngramModel.reportModel(trainingSet, validationSet);
|
||||
System.out.println("Finished " + company.getId() + " / 50");
|
||||
trainingSet.addAll(coTrainingSet);
|
||||
testSet.addAll(coTestSet);
|
||||
}
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
System.out.println("Stock symbol CSV file does not exist: "
|
||||
@@ -88,6 +79,8 @@ public class ModelGenerator {
|
||||
+ stockSymbolsCSV);
|
||||
System.exit(IO_EXCEPTION);
|
||||
}
|
||||
|
||||
//modelGenerator.ngramModel.reportModel(trainingSet, testSet);
|
||||
}
|
||||
|
||||
private List<Company> getFortune50(File csvFile)
|
||||
|
||||
180
src/net/woodyfolsom/cs6601/p3/PricePoller.java
Normal file
180
src/net/woodyfolsom/cs6601/p3/PricePoller.java
Normal file
@@ -0,0 +1,180 @@
|
||||
package net.woodyfolsom.cs6601.p3;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.context.support.FileSystemXmlApplicationContext;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import net.woodyfolsom.cs6601.p3.domain.Company;
|
||||
import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
|
||||
import net.woodyfolsom.cs6601.p3.ngram.NGramModel;
|
||||
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
|
||||
|
||||
@Component
|
||||
public class PricePoller {
|
||||
private static final File stockSymbolsCSV = new File("stock_symbols.csv");
|
||||
|
||||
private static final int INVALID_DATE = 1;
|
||||
private static final int IO_EXCEPTION = 2;
|
||||
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 3;
|
||||
|
||||
@Autowired
|
||||
HeadlineService mySQLHeadlineServiceImpl;
|
||||
|
||||
private NGramModel ngramModel = new NGramModel();
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
ApplicationContext context = new FileSystemXmlApplicationContext(
|
||||
new String[] { "AppContext.xml" });
|
||||
PricePoller modelGenerator = context.getBean(PricePoller.class);
|
||||
DateFormat dateFmt = new SimpleDateFormat("yyyy-MM-dd");
|
||||
|
||||
Date startDate = null;
|
||||
Date endDate = null;
|
||||
try {
|
||||
startDate = dateFmt.parse("2012-01-01");
|
||||
endDate = dateFmt.parse("2012-04-14");
|
||||
} catch (ParseException pe) {
|
||||
System.exit(INVALID_DATE);
|
||||
}
|
||||
|
||||
List<Headline> trainingSet = new ArrayList<Headline>();
|
||||
//actually, this is the TEST dataset
|
||||
List<Headline> testSet = new ArrayList<Headline>();
|
||||
Map<String,Map<Date,StockPrice>> stockTrends = new HashMap<String,Map<Date,StockPrice>>();
|
||||
try {
|
||||
List<Company> fortune50 = modelGenerator
|
||||
.getFortune50(stockSymbolsCSV);
|
||||
for (Company company : fortune50) {
|
||||
stockTrends.put(company.getStockSymbol(), new HashMap<Date,StockPrice>());
|
||||
System.out.println("Polling price data for " + company.getName());
|
||||
File stockPriceFile = new File("data" + File.separator + company.getStockSymbol() + ".txt");
|
||||
BufferedReader buf;
|
||||
try {
|
||||
buf = new BufferedReader(new InputStreamReader(new FileInputStream(stockPriceFile)));
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
System.out.println("Unable to find historical stock data file for: " + company.getStockSymbol());
|
||||
continue;
|
||||
}
|
||||
String line;
|
||||
int linesRead = 0;
|
||||
try {
|
||||
while ((line = buf.readLine()) != null) {
|
||||
linesRead++;
|
||||
if (linesRead == 1) {
|
||||
continue; // header line
|
||||
}
|
||||
String[] fields = line.trim().split(",");
|
||||
Date date;
|
||||
try {
|
||||
date = dateFmt.parse(fields[0]);
|
||||
} catch (ParseException pe) {
|
||||
System.out.println("Error parsing date: " + fields[0]);
|
||||
continue;
|
||||
}
|
||||
if (date.compareTo(endDate) > 0) {
|
||||
continue;
|
||||
}
|
||||
if (date.compareTo(startDate) < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
double open;
|
||||
double high;
|
||||
double low;
|
||||
double close;
|
||||
long volume;
|
||||
double adjClose;
|
||||
|
||||
try {
|
||||
open = Double.parseDouble(fields[1]);
|
||||
high = Double.parseDouble(fields[2]);
|
||||
low = Double.parseDouble(fields[3]);
|
||||
close = Double.parseDouble(fields[4]);
|
||||
volume = Long.parseLong(fields[5]);
|
||||
adjClose = Double.parseDouble(fields[6]);
|
||||
} catch (NumberFormatException nfe) {
|
||||
System.out.println(nfe.getMessage());
|
||||
continue;
|
||||
}
|
||||
|
||||
StockPrice stockPrice = new StockPrice(date,open,high,low,close,volume,adjClose);
|
||||
|
||||
stockTrends.get(company.getStockSymbol()).put(date,stockPrice);
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
System.err.println(ioe.getMessage());
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
buf.close();
|
||||
} catch (IOException ioe) {
|
||||
System.err.println(ioe.getMessage());
|
||||
}
|
||||
}
|
||||
for (Company company : fortune50) {
|
||||
System.out.println("Getting headlines for Fortune 50 company #"
|
||||
+ company.getId() + " (" + company.getName() + ")...");
|
||||
List<Headline> coTrainingSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 1);
|
||||
System.out.println("Pulled " + coTrainingSet.size() + " TRAINING headlines for "
|
||||
+ company.getStockSymbol() + " from " + startDate + " to " + endDate);
|
||||
List<Headline> coTestSet = modelGenerator.mySQLHeadlineServiceImpl.getHeadlines(company.getStockSymbol(), startDate, endDate, 2);
|
||||
System.out.println("Pulled " + coTestSet.size() + " TEST headlines for "
|
||||
+ company.getStockSymbol() + " from " + startDate + " to " + endDate);
|
||||
|
||||
trainingSet.addAll(coTrainingSet);
|
||||
testSet.addAll(coTestSet);
|
||||
}
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
System.out.println("Stock symbol CSV file does not exist: "
|
||||
+ stockSymbolsCSV);
|
||||
System.exit(STOCK_SYMBOL_CSV_NOT_FOUND);
|
||||
} catch (IOException ioe) {
|
||||
System.out.println("Stock symbol CSV file does not exist: "
|
||||
+ stockSymbolsCSV);
|
||||
System.exit(IO_EXCEPTION);
|
||||
}
|
||||
|
||||
modelGenerator.ngramModel.reportModel(trainingSet, testSet, stockTrends);
|
||||
}
|
||||
|
||||
private List<Company> getFortune50(File csvFile)
|
||||
throws FileNotFoundException, IOException {
|
||||
List<Company> fortune50 = new ArrayList<Company>();
|
||||
FileInputStream fis = new FileInputStream(csvFile);
|
||||
InputStreamReader reader = new InputStreamReader(fis);
|
||||
BufferedReader buf = new BufferedReader(reader);
|
||||
String csvline = null;
|
||||
while ((csvline = buf.readLine()) != null) {
|
||||
if (csvline.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
String[] fields = csvline.split(",");
|
||||
if (fields.length != 3) {
|
||||
throw new RuntimeException(
|
||||
"Badly formatted csv file name (3 values expected): "
|
||||
+ csvline);
|
||||
}
|
||||
int id = Integer.valueOf(fields[0]);
|
||||
fortune50.add(new Company(id, fields[1], fields[2]));
|
||||
}
|
||||
return fortune50;
|
||||
}
|
||||
}
|
||||
25
src/net/woodyfolsom/cs6601/p3/StockUtil.java
Normal file
25
src/net/woodyfolsom/cs6601/p3/StockUtil.java
Normal file
@@ -0,0 +1,25 @@
|
||||
package net.woodyfolsom.cs6601.p3;
|
||||
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
||||
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
|
||||
|
||||
public class StockUtil {
|
||||
public static Date getNextTradingDay(Date date) {
|
||||
Calendar cal = Calendar.getInstance();
|
||||
cal.setTime(date);
|
||||
do {
|
||||
cal.add(Calendar.DATE, 1);
|
||||
} while (cal.get(Calendar.DAY_OF_WEEK) == 1 || cal.get(Calendar.DAY_OF_WEEK) == 7);
|
||||
return cal.getTime();
|
||||
}
|
||||
|
||||
public static double getPercentChange(StockPrice stockPrice) {
|
||||
double close = stockPrice.getClose();
|
||||
double open = stockPrice.getOpen();
|
||||
//If close is 2x open, pct change is 1.0;
|
||||
//If close is 0.9 * open, pct change is -0.10;
|
||||
return 100.0 * ((close / open) - 1.00);
|
||||
}
|
||||
}
|
||||
256
src/net/woodyfolsom/cs6601/p3/ValidationSetCreator.java
Normal file
256
src/net/woodyfolsom/cs6601/p3/ValidationSetCreator.java
Normal file
@@ -0,0 +1,256 @@
|
||||
package net.woodyfolsom.cs6601.p3;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.text.DateFormat;
|
||||
import java.text.DecimalFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.context.support.FileSystemXmlApplicationContext;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import net.woodyfolsom.cs6601.p3.domain.Company;
|
||||
import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
|
||||
import net.woodyfolsom.cs6601.p3.ngram.NGram;
|
||||
import net.woodyfolsom.cs6601.p3.ngram.NGramModel;
|
||||
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
|
||||
|
||||
@Component
|
||||
public class ValidationSetCreator {
|
||||
private static final File stockSymbolsCSV = new File("stock_symbols.csv");
|
||||
|
||||
private static final int INVALID_DATE = 1;
|
||||
private static final int IO_EXCEPTION = 2;
|
||||
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 3;
|
||||
|
||||
@Autowired
|
||||
HeadlineService mySQLHeadlineServiceImpl;
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
ApplicationContext context = new FileSystemXmlApplicationContext(
|
||||
new String[] { "AppContext.xml" });
|
||||
ValidationSetCreator modelGenerator = context
|
||||
.getBean(ValidationSetCreator.class);
|
||||
DateFormat dateFmt = new SimpleDateFormat("yyyy-MM-dd");
|
||||
|
||||
Date startDate = null;
|
||||
Date endDate = null;
|
||||
try {
|
||||
startDate = dateFmt.parse("2012-01-01");
|
||||
endDate = dateFmt.parse("2012-04-14");
|
||||
} catch (ParseException pe) {
|
||||
System.exit(INVALID_DATE);
|
||||
}
|
||||
|
||||
Map<String, Map<Date, StockPrice>> stockTrends = new HashMap<String, Map<Date, StockPrice>>();
|
||||
try {
|
||||
List<Company> fortune50 = modelGenerator
|
||||
.getFortune50(stockSymbolsCSV);
|
||||
for (Company company : fortune50) {
|
||||
stockTrends.put(company.getStockSymbol(),
|
||||
new HashMap<Date, StockPrice>());
|
||||
System.out.println("Polling price data for "
|
||||
+ company.getName());
|
||||
File stockPriceFile = new File("data" + File.separator
|
||||
+ company.getStockSymbol() + ".txt");
|
||||
BufferedReader buf;
|
||||
try {
|
||||
buf = new BufferedReader(new InputStreamReader(
|
||||
new FileInputStream(stockPriceFile)));
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
System.out
|
||||
.println("Unable to find historical stock data file for: "
|
||||
+ company.getStockSymbol());
|
||||
continue;
|
||||
}
|
||||
String line;
|
||||
int linesRead = 0;
|
||||
try {
|
||||
while ((line = buf.readLine()) != null) {
|
||||
linesRead++;
|
||||
if (linesRead == 1) {
|
||||
continue; // header line
|
||||
}
|
||||
String[] fields = line.trim().split(",");
|
||||
Date date;
|
||||
try {
|
||||
date = dateFmt.parse(fields[0]);
|
||||
} catch (ParseException pe) {
|
||||
System.out.println("Error parsing date: "
|
||||
+ fields[0]);
|
||||
continue;
|
||||
}
|
||||
if (date.compareTo(endDate) > 0) {
|
||||
continue;
|
||||
}
|
||||
if (date.compareTo(startDate) < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
double open;
|
||||
double high;
|
||||
double low;
|
||||
double close;
|
||||
long volume;
|
||||
double adjClose;
|
||||
|
||||
try {
|
||||
open = Double.parseDouble(fields[1]);
|
||||
high = Double.parseDouble(fields[2]);
|
||||
low = Double.parseDouble(fields[3]);
|
||||
close = Double.parseDouble(fields[4]);
|
||||
volume = Long.parseLong(fields[5]);
|
||||
adjClose = Double.parseDouble(fields[6]);
|
||||
} catch (NumberFormatException nfe) {
|
||||
System.out.println(nfe.getMessage());
|
||||
continue;
|
||||
}
|
||||
|
||||
StockPrice stockPrice = new StockPrice(date, open,
|
||||
high, low, close, volume, adjClose);
|
||||
|
||||
stockTrends.get(company.getStockSymbol()).put(date,
|
||||
stockPrice);
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
System.err.println(ioe.getMessage());
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
buf.close();
|
||||
} catch (IOException ioe) {
|
||||
System.err.println(ioe.getMessage());
|
||||
}
|
||||
}
|
||||
List<Headline> valSet = new ArrayList<Headline>();
|
||||
for (Company company : fortune50) {
|
||||
System.out.println("Getting headlines for Fortune 50 company #"
|
||||
+ company.getId() + " (" + company.getName() + ")...");
|
||||
List<Headline> coValSet = modelGenerator.mySQLHeadlineServiceImpl
|
||||
.getHeadlines(company.getStockSymbol(), startDate,
|
||||
endDate, 3);
|
||||
System.out.println("Pulled " + coValSet.size()
|
||||
+ " VALIDATION headlines for "
|
||||
+ company.getStockSymbol() + " from " + startDate
|
||||
+ " to " + endDate);
|
||||
|
||||
valSet.addAll(coValSet);
|
||||
}
|
||||
|
||||
File file = new File("validation.txt");
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
|
||||
new FileOutputStream(file)));
|
||||
Map<String,Integer> headlineCount = new HashMap<String,Integer>();
|
||||
Map<String,Double> totalPctChange = new HashMap<String,Double>();
|
||||
|
||||
for (Headline headline : valSet) {
|
||||
String text = headline.getText();
|
||||
Integer count = headlineCount.get(text);
|
||||
Double pctChange = totalPctChange.get(text);
|
||||
Date date = headline.getDate();
|
||||
String stock = headline.getStock();
|
||||
StockPrice stockPrice = stockTrends.get(stock).get(
|
||||
StockUtil.getNextTradingDay(date));
|
||||
double pctPriceChange;
|
||||
if (stockPrice == null) {
|
||||
pctPriceChange = 0.0;
|
||||
} else {
|
||||
pctPriceChange = StockUtil.getPercentChange(stockPrice);
|
||||
}
|
||||
if (count == null) {
|
||||
headlineCount.put(text, 1);
|
||||
totalPctChange.put(text, pctPriceChange);
|
||||
} else {
|
||||
headlineCount.put(text, count+1);
|
||||
totalPctChange.put(text, pctChange + pctPriceChange);
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> processedSet = new HashSet<String>();
|
||||
DecimalFormat decFmt = new DecimalFormat("###0.0000");
|
||||
for (Headline headline : valSet) {
|
||||
String text = headline.getText();
|
||||
if (processedSet.contains(text)) {
|
||||
continue;
|
||||
}
|
||||
processedSet.add(text);
|
||||
int id = headline.getId();
|
||||
String stock = headline.getStock();
|
||||
Date date = headline.getDate();
|
||||
String dateFormatted = dateFmt.format(date);
|
||||
|
||||
double totalPriceChange = totalPctChange.get(text);
|
||||
int totalCount = headlineCount.get(text);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(id);
|
||||
sb.append(", ");
|
||||
sb.append(stock);
|
||||
sb.append(", ");
|
||||
sb.append(dateFormatted);
|
||||
sb.append(", ");
|
||||
sb.append(decFmt.format(totalPriceChange/totalCount));
|
||||
sb.append(", ");
|
||||
|
||||
text = text.replaceAll(
|
||||
"[\'\";:,\\]\\[]", " ");
|
||||
text = text.replaceAll(
|
||||
"[^A-Za-z0-9 ]", "");
|
||||
sb.append(text);
|
||||
|
||||
sb.append("\n");
|
||||
writer.write(sb.toString());
|
||||
}
|
||||
writer.close();
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
System.out.println("Stock symbol CSV file does not exist: "
|
||||
+ stockSymbolsCSV);
|
||||
System.exit(STOCK_SYMBOL_CSV_NOT_FOUND);
|
||||
} catch (IOException ioe) {
|
||||
System.out.println("Stock symbol CSV file does not exist: "
|
||||
+ stockSymbolsCSV);
|
||||
System.exit(IO_EXCEPTION);
|
||||
}
|
||||
}
|
||||
|
||||
private List<Company> getFortune50(File csvFile)
|
||||
throws FileNotFoundException, IOException {
|
||||
List<Company> fortune50 = new ArrayList<Company>();
|
||||
FileInputStream fis = new FileInputStream(csvFile);
|
||||
InputStreamReader reader = new InputStreamReader(fis);
|
||||
BufferedReader buf = new BufferedReader(reader);
|
||||
String csvline = null;
|
||||
while ((csvline = buf.readLine()) != null) {
|
||||
if (csvline.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
String[] fields = csvline.split(",");
|
||||
if (fields.length != 3) {
|
||||
throw new RuntimeException(
|
||||
"Badly formatted csv file name (3 values expected): "
|
||||
+ csvline);
|
||||
}
|
||||
int id = Integer.valueOf(fields[0]);
|
||||
fortune50.add(new Company(id, fields[1], fields[2]));
|
||||
}
|
||||
return fortune50;
|
||||
}
|
||||
}
|
||||
@@ -6,12 +6,14 @@ import java.util.List;
|
||||
import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
|
||||
/**
 * Data-access contract for stored news headlines. Dataset codes used
 * throughout: 1 = training, 2 = test, 3 = validation.
 */
public interface HeadlineDao {

	/**
	 * Randomly partitions all headline rows into the training, test and
	 * validation datasets using the given percentages.
	 *
	 * @param training percent of rows to assign to the training set
	 * @param test percent of rows to assign to the test set
	 * @param validation percent of rows to assign to the validation set
	 * @return false (and no change is made) if the three percentages do not
	 *         sum to 100; true otherwise
	 */
	boolean assignRandomDatasets(int training, int test, int validation);
	/** Returns the total number of stored headlines. */
	int getCount();
	/** Returns the number of headlines assigned to the given dataset (1/2/3). */
	int getCount(int dataset);
	/** Deletes the headline with the given id; returns the rows affected. */
	int deleteById(int id);
	/** Inserts one headline; returns the rows affected. */
	int insert(Headline headline);
	/** Inserts a batch of headlines; returns per-statement update counts. */
	int[] insertBatch(List<Headline> headlines);

	/** Returns the headline with the given id. */
	Headline select(int id);
	/** Returns a stock's headlines for a single date. */
	List<Headline> select(String stock, Date date);
	/** Returns a stock's headlines within the inclusive date range. */
	List<Headline> select(String stock, Date startDate, Date endDate);
	/** Returns a stock's headlines within the inclusive date range, restricted to one dataset. */
	List<Headline> select(String stock, Date startDate, Date endDate, int dataset);
}
|
||||
|
||||
@@ -18,16 +18,36 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
|
||||
@Repository
|
||||
public class HeadlineDaoImpl implements HeadlineDao {
|
||||
private static final String COUNT_ALL_QRY = "SELECT COUNT(1) FROM headlines";
|
||||
private static final String COUNT_DATASET_QRY = "SELECT COUNT(1) FROM headlines where dataset = ?";
|
||||
|
||||
private static final String DELETE_BY_ID_STMT = "DELETE from headlines WHERE id = ?";
|
||||
|
||||
private static final String INSERT_STMT = "INSERT INTO headlines (text, date, stock, dataset) values (?, ?, ?, ?)";
|
||||
|
||||
private static final String SELECT_BY_ID_QRY = "SELECT * from headlines WHERE id = ?";
|
||||
private static final String SELECT_BY_STOCK_QRY = "SELECT * from headlines WHERE stock = ? AND date = ?";
|
||||
private static final String SELECT_BY_DATE_RANGE_QRY = "SELECT * from headlines WHERE stock = ? AND date >= ? AND date <= ?";
|
||||
private static final String SELECT_BY_STOCK_QRY = "SELECT * from headlines WHERE stock = ? AND date = ? AND dataset = 1";
|
||||
private static final String SELECT_BY_DATE_RANGE_QRY = "SELECT * from headlines WHERE stock = ? AND date >= ? AND date <= ? AND dataset = ?";
|
||||
|
||||
private static final String ASSIGN_RANDOM_PCT_QRY = "update headlines set dataset = (select FLOOR(RAND() * (200 - 101) + 101))";
|
||||
private static final String REMAP_TRAINING_QRY = "update headlines set dataset = 1 where dataset >= 101 and dataset <= (100 + ?)";
|
||||
private static final String REMAP_TEST_QRY = "update headlines set dataset = 2 where dataset >= (100 + ?) and dataset <= (100 + ?)";
|
||||
private static final String REMAP_VAL_QRY = "update headlines set dataset = 3 where dataset >= (100 + ?) and dataset <= 200";
|
||||
|
||||
private JdbcTemplate jdbcTemplate;
|
||||
|
||||
@Override
|
||||
public boolean assignRandomDatasets(int training, int test, int validation) {
|
||||
if (training + test + validation != 100) {
|
||||
return false;
|
||||
}
|
||||
jdbcTemplate.update(ASSIGN_RANDOM_PCT_QRY);
|
||||
jdbcTemplate.update(REMAP_TRAINING_QRY,training);
|
||||
jdbcTemplate.update(REMAP_TEST_QRY,training,training+test);
|
||||
jdbcTemplate.update(REMAP_VAL_QRY,training+test);
|
||||
return true;
|
||||
}
|
||||
|
||||
public int deleteById(int headlineId) {
|
||||
return jdbcTemplate.update(DELETE_BY_ID_STMT,
|
||||
new RequestMapper(), headlineId);
|
||||
@@ -64,12 +84,12 @@ public class HeadlineDaoImpl implements HeadlineDao {
|
||||
|
||||
public List<Headline> select(String stock, Date date) {
|
||||
return jdbcTemplate.query(SELECT_BY_STOCK_QRY,
|
||||
new RequestMapper(), stock, date);
|
||||
new RequestMapper(), stock, date, 1);
|
||||
}
|
||||
|
||||
public List<Headline> select(String stock, Date startDate, Date endDate) {
|
||||
public List<Headline> select(String stock, Date startDate, Date endDate, int dataset) {
|
||||
return jdbcTemplate.query(SELECT_BY_DATE_RANGE_QRY,
|
||||
new RequestMapper(), stock, startDate, endDate);
|
||||
new RequestMapper(), stock, startDate, endDate, dataset);
|
||||
}
|
||||
|
||||
@Autowired
|
||||
@@ -82,6 +102,7 @@ public class HeadlineDaoImpl implements HeadlineDao {
|
||||
@Override
|
||||
public Headline mapRow(ResultSet rs, int arg1) throws SQLException {
|
||||
Headline headline = new Headline();
|
||||
headline.setId(rs.getInt("id"));
|
||||
headline.setText(rs.getString("text"));
|
||||
headline.setStock(rs.getString("stock"));
|
||||
headline.setDate(rs.getDate("date"));
|
||||
@@ -90,4 +111,14 @@ public class HeadlineDaoImpl implements HeadlineDao {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return jdbcTemplate.queryForInt(COUNT_ALL_QRY);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount(int dataset) {
|
||||
return jdbcTemplate.queryForInt(COUNT_DATASET_QRY,dataset);
|
||||
}
|
||||
}
|
||||
57
src/net/woodyfolsom/cs6601/p3/domain/StockPrice.java
Normal file
57
src/net/woodyfolsom/cs6601/p3/domain/StockPrice.java
Normal file
@@ -0,0 +1,57 @@
|
||||
package net.woodyfolsom.cs6601.p3.domain;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
/**
 * Immutable value object holding one day's price record for a stock, as
 * parsed from the per-company historical data files. Field order mirrors
 * the CSV layout:
 *   Date,Open,High,Low,Close,Volume,Adj_Close
 *   e.g. 2012-04-17,0.28,0.33,0.28,0.32,3408100,0.32
 */
public class StockPrice {
	final Date date;

	final double open;
	final double high;
	final double low;
	final double close;
	final double adjClose;

	final long volume;

	/**
	 * Creates a price record for a single trading day.
	 *
	 * @param date the trading day
	 * @param open opening price
	 * @param high intraday high
	 * @param low intraday low
	 * @param close closing price
	 * @param volume shares traded
	 * @param adjClose closing price adjusted for splits/dividends
	 */
	public StockPrice(Date date, double open, double high, double low, double close,
			long volume, double adjClose) {
		this.date = date;
		this.open = open;
		this.high = high;
		this.low = low;
		this.close = close;
		this.volume = volume;
		this.adjClose = adjClose;
	}

	/** @return the trading day this record describes */
	public Date getDate() {
		return date;
	}

	/** @return the opening price */
	public double getOpen() {
		return open;
	}

	/** @return the intraday high */
	public double getHigh() {
		return high;
	}

	/** @return the intraday low */
	public double getLow() {
		return low;
	}

	/** @return the closing price */
	public double getClose() {
		return close;
	}

	/** @return the number of shares traded */
	public long getVolume() {
		return volume;
	}

	/** @return the adjusted closing price */
	public double getAdjClose() {
		return adjClose;
	}
}
|
||||
@@ -1,10 +1,17 @@
|
||||
package net.woodyfolsom.cs6601.p3.ngram;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@@ -14,7 +21,9 @@ import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.woodyfolsom.cs6601.p3.StockUtil;
|
||||
import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
import net.woodyfolsom.cs6601.p3.domain.StockPrice;
|
||||
|
||||
public class NGramModel {
|
||||
static final int MAX_N_GRAM_LENGTH = 3;
|
||||
@@ -25,6 +34,8 @@ public class NGramModel {
|
||||
private static final String UNK = "<unk>";
|
||||
|
||||
private Map<Integer, NGramDistribution> nGrams;
|
||||
private Map<Integer, Map<NGram,Double>> nGramPriceAvg;
|
||||
|
||||
private int[] totalNGramCounts = new int[MAX_N_GRAM_LENGTH + 1];
|
||||
|
||||
private Pattern wordPattern = Pattern.compile("\\w+");
|
||||
@@ -88,9 +99,13 @@ public class NGramModel {
|
||||
for (int i = 0; i <= MAX_N_GRAM_LENGTH; i++) {
|
||||
nGrams.put(i, new NGramDistribution());
|
||||
}
|
||||
nGramPriceAvg = new HashMap<Integer, Map<NGram,Double>>();
|
||||
for (int i = 0; i <= MAX_N_GRAM_LENGTH; i++) {
|
||||
nGramPriceAvg.put(i, new HashMap<NGram,Double>());
|
||||
}
|
||||
}
|
||||
|
||||
private void addNGram(int nGramLength, NGram nGram) {
|
||||
|
||||
private void addNGram(int nGramLength, NGram nGram, String stockName, Date date, Map<String, Map<Date,StockPrice>> stockTrends) {
|
||||
if (nGram.size() < nGramLength) {
|
||||
System.out.println("Cannot create " + nGramLength + "-gram from: "
|
||||
+ nGram);
|
||||
@@ -105,10 +120,31 @@ public class NGramModel {
|
||||
} else {
|
||||
nGramCounts.put(nGramCopy, 1);
|
||||
}
|
||||
|
||||
Map<NGram, Double> nGramPriceAvgs = nGramPriceAvg.get(nGramLength);
|
||||
|
||||
NGram nGramCopy2 = nGram.copy(nGramLength);
|
||||
|
||||
//TODO GET NEXT TRADING DAY'S DATE
|
||||
Date nextDay = StockUtil.getNextTradingDay(date);
|
||||
StockPrice stockPrice = stockTrends.get(stockName).get(nextDay);
|
||||
double percentChange;
|
||||
if (stockPrice == null) {
|
||||
percentChange = 0.0;
|
||||
} else {
|
||||
percentChange = StockUtil.getPercentChange(stockPrice);
|
||||
}
|
||||
|
||||
if (nGramPriceAvgs.containsKey(nGramCopy2)) {
|
||||
double totalPercentChange = nGramPriceAvgs.get(nGramCopy);
|
||||
nGramPriceAvgs.put(nGramCopy2, totalPercentChange + percentChange);
|
||||
} else {
|
||||
nGramPriceAvgs.put(nGramCopy2, percentChange);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an arbitrary String, replace punctutation with spaces, remove
|
||||
* Given an arbitrary String, replace punctuation with spaces, remove
|
||||
* non-alphanumeric characters, prepend with <START> token, append <END>
|
||||
* token.
|
||||
*
|
||||
@@ -193,12 +229,11 @@ public class NGramModel {
|
||||
+ " recognized n-grams in verification corpus: " + perplexity);
|
||||
}
|
||||
|
||||
private void generateModel(List<Headline> traininSet, boolean genRandom,
|
||||
boolean useUnk) throws FileNotFoundException, IOException {
|
||||
StringBuilder currentLine = new StringBuilder();
|
||||
List<String> fileByLines = new ArrayList<String>();
|
||||
private void generateModel(List<Headline> trainingSet, boolean genRandom,
|
||||
boolean useUnk, Map<String,Map<Date,StockPrice>> stockTrends) throws FileNotFoundException, IOException {
|
||||
//List<String> fileByLines = new ArrayList<String>();
|
||||
|
||||
for (Headline headline : traininSet) {
|
||||
for (Headline headline : trainingSet) {
|
||||
String headlineText = headline.getText();
|
||||
if (headlineText.length() == 0) {
|
||||
continue;
|
||||
@@ -206,6 +241,9 @@ public class NGramModel {
|
||||
String sanitizedLine = sanitize(headline.getText());
|
||||
// split on whitespace
|
||||
String[] tokens = sanitizedLine.toLowerCase().split("\\s+");
|
||||
|
||||
StringBuilder currentLine = new StringBuilder();
|
||||
|
||||
for (String token : tokens) {
|
||||
if (!isWord(token)) {
|
||||
continue;
|
||||
@@ -222,67 +260,67 @@ public class NGramModel {
|
||||
|
||||
if (END.equals(word)) {
|
||||
currentLine.append(word);
|
||||
fileByLines.add(currentLine.toString());
|
||||
currentLine = new StringBuilder();
|
||||
//fileByLines.add(currentLine.toString());
|
||||
//currentLine = new StringBuilder();
|
||||
} else {
|
||||
currentLine.append(word);
|
||||
currentLine.append(" ");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (String str : fileByLines) {
|
||||
String str = currentLine.toString();
|
||||
System.out.println(str);
|
||||
NGram currentNgram = new NGram(MAX_N_GRAM_LENGTH);
|
||||
for (String token : str.split("\\s+")) {
|
||||
currentNgram.add(token);
|
||||
for (int i = 0; i <= currentNgram.size(); i++) {
|
||||
addNGram(currentNgram.size() - i,
|
||||
currentNgram.subNGram(i, currentNgram.size()));
|
||||
currentNgram.subNGram(i, currentNgram.size()), headline.getStock(), headline.getDate(), stockTrends);
|
||||
totalNGramCounts[currentNgram.size() - i]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Most common words: ");
|
||||
|
||||
List<Entry<NGram, Integer>> unigrams = new ArrayList<Entry<NGram, Integer>>(
|
||||
nGrams.get(1).entrySet());
|
||||
Collections.sort(unigrams, new NGramComparator());
|
||||
|
||||
List<Entry<NGram, Integer>> bigrams = new ArrayList<Entry<NGram, Integer>>(
|
||||
nGrams.get(2).entrySet());
|
||||
Collections.sort(bigrams, new NGramComparator());
|
||||
|
||||
List<Entry<NGram, Integer>> trigrams = new ArrayList<Entry<NGram, Integer>>(
|
||||
nGrams.get(3).entrySet());
|
||||
Collections.sort(trigrams, new NGramComparator());
|
||||
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
System.out
|
||||
.println(i
|
||||
+ ". "
|
||||
+ unigrams.get(i - 1).getKey()
|
||||
+ " : "
|
||||
+ (((double) (unigrams.get(i - 1).getValue()) / totalNGramCounts[1])));
|
||||
}
|
||||
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
System.out
|
||||
.println(i
|
||||
+ ". "
|
||||
+ bigrams.get(i - 1).getKey()
|
||||
+ " : "
|
||||
+ (((double) (bigrams.get(i - 1).getValue()) / totalNGramCounts[2])));
|
||||
}
|
||||
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
System.out
|
||||
.println(i
|
||||
+ ". "
|
||||
+ trigrams.get(i - 1).getKey()
|
||||
+ " : "
|
||||
+ (((double) (trigrams.get(i - 1).getValue()) / totalNGramCounts[3])));
|
||||
DecimalFormat decFmt = new DecimalFormat("###0.0000");
|
||||
for (int modelIndex = 1; modelIndex <= 3; modelIndex++) {
|
||||
File file = new File(modelIndex + "grams.txt");
|
||||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file)));
|
||||
List<Entry<NGram, Integer>> ngrams = new ArrayList<Entry<NGram, Integer>>(
|
||||
nGrams.get(modelIndex).entrySet());
|
||||
Collections.sort(ngrams, new NGramComparator());
|
||||
System.out.println("Highest frequency " + modelIndex + "-grams:");
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
System.out
|
||||
.println(i
|
||||
+ ". "
|
||||
+ ngrams.get(i - 1).getKey()
|
||||
+ " : "
|
||||
+ (((double) (ngrams.get(i - 1).getValue()) / totalNGramCounts[1])));
|
||||
}
|
||||
Map<NGram,Double> pricesForModel = nGramPriceAvg.get(modelIndex);
|
||||
for (int nGramIndex = 1; nGramIndex <= ngrams.size(); nGramIndex++) {
|
||||
NGram key = ngrams.get(nGramIndex - 1).getKey();
|
||||
writer.write(key.toString());
|
||||
writer.write(",");
|
||||
int count = ngrams.get(nGramIndex - 1).getValue();
|
||||
writer.write(Integer.toString(count));
|
||||
writer.write(",");
|
||||
double avgPrice;
|
||||
try {
|
||||
avgPrice = pricesForModel.get(key);
|
||||
System.out.println("Avg price for " + modelIndex + "-gram " + key +": " + avgPrice);
|
||||
} catch (NullPointerException npe) {
|
||||
System.out.println("null avgPrice for " + modelIndex + "-gram " + key);
|
||||
avgPrice = 0.0;
|
||||
}
|
||||
writer.write(decFmt.format(avgPrice/(double)count));
|
||||
writer.write("\n");
|
||||
}
|
||||
try {
|
||||
writer.close();
|
||||
} catch (IOException ioe) {
|
||||
System.out.println(ioe.getMessage());
|
||||
}
|
||||
}
|
||||
if (genRandom) {
|
||||
for (int nGramLength = 1; nGramLength <= MAX_N_GRAM_LENGTH; nGramLength++) {
|
||||
@@ -333,11 +371,11 @@ public class NGramModel {
|
||||
}
|
||||
|
||||
public void reportModel(List<Headline> trainingSet,
|
||||
List<Headline> validationSet) {
|
||||
List<Headline> validationSet, Map<String,Map<Date,StockPrice>> stockTrends) {
|
||||
try {
|
||||
NGramModel ngm = new NGramModel();
|
||||
boolean doCalcPerplexity = true;
|
||||
ngm.generateModel(trainingSet, !doCalcPerplexity, doCalcPerplexity);
|
||||
ngm.generateModel(trainingSet, !doCalcPerplexity, doCalcPerplexity, stockTrends);
|
||||
if (doCalcPerplexity) {
|
||||
for (int i = 1; i <= MAX_N_GRAM_LENGTH; i++) {
|
||||
ngm.calcPerplexity(validationSet, i, true);
|
||||
|
||||
@@ -6,8 +6,11 @@ import java.util.List;
|
||||
import net.woodyfolsom.cs6601.p3.domain.Headline;
|
||||
|
||||
public interface HeadlineService {
|
||||
boolean assignRandomDatasets(int training, int test, int validation);
|
||||
int getCount();
|
||||
int getCount(int dataset);
|
||||
int insertHeadline(Headline headline);
|
||||
int[] insertHeadlines(List<Headline> headline);
|
||||
List<Headline> getHeadlines(String stock, Date date);
|
||||
List<Headline> getHeadlines(String stock, Date startDate, Date endDate);
|
||||
List<Headline> getHeadlines(String stock, Date startDate, Date endDate, int dataset);
|
||||
}
|
||||
@@ -34,7 +34,22 @@ public class MySQLHeadlineServiceImpl implements HeadlineService {
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Headline> getHeadlines(String stock, Date startDate, Date endDate) {
|
||||
return headlineDao.select(stock, startDate, endDate);
|
||||
public List<Headline> getHeadlines(String stock, Date startDate, Date endDate, int dataset) {
|
||||
return headlineDao.select(stock, startDate, endDate, dataset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean assignRandomDatasets(int training, int test, int validation) {
|
||||
return headlineDao.assignRandomDatasets(training, test, validation);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return headlineDao.getCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount(int dataset) {
|
||||
return headlineDao.getCount(dataset);
|
||||
}
|
||||
}
|
||||
@@ -89,7 +89,22 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
|
||||
|
||||
@Override
|
||||
public List<Headline> getHeadlines(String stock, Date startDate,
|
||||
Date endDate) {
|
||||
Date endDate, int dataset) {
|
||||
throw new UnsupportedOperationException("This implementation does not support getting headlines for a date range.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean assignRandomDatasets(int training, int test, int validation) {
|
||||
throw new UnsupportedOperationException("This implementation does not support this method.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
throw new UnsupportedOperationException("This implementation does not support this method");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount(int dataset) {
|
||||
throw new UnsupportedOperationException("This implementation does not support this method");
|
||||
}
|
||||
}
|
||||
@@ -1,21 +1,23 @@
|
||||
package net.woodyfolsom.cs6601.p3.dao;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.context.support.ClassPathXmlApplicationContext;
|
||||
import org.springframework.context.support.FileSystemXmlApplicationContext;
|
||||
|
||||
public class MySQLHeadlineDaoImplTest {
|
||||
private static HeadlineService headlineSvc;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() {
|
||||
ApplicationContext context=new ClassPathXmlApplicationContext(new String[]{"/AppContext.xml"});
|
||||
ApplicationContext context=new FileSystemXmlApplicationContext(new String[]{"AppContext.xml"});
|
||||
headlineSvc = (HeadlineService) context
|
||||
.getBean("mySQLHeadlineSvc");
|
||||
}
|
||||
@@ -24,4 +26,29 @@ public class MySQLHeadlineDaoImplTest {
|
||||
public void testSelect() {
|
||||
assertNotNull(headlineSvc);
|
||||
}
|
||||
|
||||
//Change this back to @Test to run it... but beware, it shuffles the datasets. Best done n times for n-fold cross validation.
|
||||
@Ignore
|
||||
public void testAssignRandomDatasets() {
|
||||
|
||||
int trainingPct = 80;
|
||||
int testPct = 10;
|
||||
int valPct = 10;
|
||||
|
||||
//assignment fails if character is ommitted from valPct (80% 10% 1% by accident)
|
||||
assertFalse(headlineSvc.assignRandomDatasets(trainingPct,testPct,valPct/10));
|
||||
//assignment succeeds if requested ratio is 8-1-1
|
||||
assertTrue(headlineSvc.assignRandomDatasets(trainingPct,testPct,valPct));
|
||||
|
||||
int allCount = headlineSvc.getCount();
|
||||
int trainingCount = headlineSvc.getCount(1);
|
||||
int testCount = headlineSvc.getCount(2);
|
||||
int valCount = headlineSvc.getCount(3);
|
||||
|
||||
assertEquals(trainingCount + testCount + valCount, allCount);
|
||||
|
||||
assertEquals((double)trainingCount/allCount,(double)trainingPct / 100.0,0.01);
|
||||
assertEquals((double)testCount/allCount,(double)testPct / 100.0,0.01);
|
||||
assertEquals((double)valCount/allCount,(double)valPct / 100.0,0.01);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user