HeadlinePuller can import all news articles for the Fortune 50 companies within a date range from Yahoo Finance historical data, inserting the headline text into the headlines table of database cs6601p3 on woodyfolsom.net:3306.

Limited to 25 headlines per day (with many repetitions) per Yahoo Finance REST functionality.
This commit is contained in:
Woody Folsom
2012-04-15 10:18:02 -04:00
parent d700d97124
commit 027adff2dd
40 changed files with 322 additions and 39 deletions

103
build.xml Normal file
View File

@@ -0,0 +1,103 @@
<project name="CS6601_P3" default="dist" basedir=".">
<!-- Ant build for the CS6601_P3 project: compiles ${src} into ${build}, stages resources and libraries into ${dist}, packages an executable jar and runs the JUnit tests. Running ant with no target executes "dist". -->
<description>
simple example build file
</description>
<!-- set global properties for this build -->
<property name="build" location="classes" />
<property name="dist" location="dist" />
<property name="docs" location="docs" />
<property name="lib" location="lib" />
<property name="project.name" value="CS6601_P3" />
<property name="res" location="res" />
<property name="src" location="src" />
<property name="test" location="test" />
<!-- Compile classpath: every jar found anywhere under ${lib} -->
<path id="build.classpath">
<fileset dir="${lib}">
<include name="**/*.jar" />
</fileset>
</path>
<!-- Test classpath: the compile classpath plus the JUnit jar and the compiled classes in ${build} -->
<path id="classpath.test">
<path refid="build.classpath" />
<pathelement location="lib/junit-4.10.jar" />
<pathelement location="${build}" />
</path>
<target name="clean" description="clean up">
<!-- Delete the ${build} and ${dist} directory trees -->
<!-- NOTE(review): the ${docs} directory written by the "docs" target is not removed here -->
<delete dir="${build}" />
<delete dir="${dist}" />
</target>
<!-- Compiles the application sources. Depends on copy-resources, which in turn depends on init, so ${build} exists before javac runs. -->
<target name="compile" depends="copy-resources">
<!-- Compile the java code from ${src} into ${build} -->
<javac srcdir="${src}" destdir="${build}" classpathref="build.classpath" debug="true" source="1.6" target="1.6"/>
</target>
<!-- Compiles the test sources from ${test} into the same ${build} directory as the application classes. -->
<!-- NOTE(review): unlike "compile", no source/target level is set on this javac call; confirm that is intended -->
<target name="compile-test" depends="compile">
<javac srcdir="${test}" destdir="${build}" debug="true">
<classpath refid="classpath.test"/>
</javac>
</target>
<!-- Stages runtime files: everything under ${res} is copied into ${dist}, and everything under ${lib} into ${dist}/lib -->
<target name="copy-resources" depends="init" >
<copy todir="${dist}">
<fileset dir="${res}">
<include name="**/*" />
</fileset>
</copy>
<copy todir="${dist}/lib">
<fileset dir="${lib}">
<include name="**/*" />
</fileset>
</copy>
</target>
<!-- Packages the compiled classes (excluding *Test.class) into ${dist}/${project.name}.jar -->
<target name="dist" depends="copy-resources, compile"
description="generate the distribution">
<jar jarfile="${dist}/${project.name}.jar">
<fileset dir="${build}" excludes="**/*Test.class" />
<!-- The manifest makes the jar runnable via "java -jar": Main-Class names the entry point and Class-Path lists the jars staged under lib/ next to the jar by copy-resources. -->
<!-- NOTE(review): the Class-Path also lists lib/junit-4.10.jar, which looks like a test-only dependency; confirm it is needed at runtime -->
<manifest>
<attribute name="Main-Class" value="net.woodyfolsom.cs6601.p3.HeadlinePuller" />
<attribute name="Class-Path" value="lib/aopalliance.jar lib/org.springframework.aspects-3.1.1.RELEASE.jar
lib/aspectj-1.6.12.jar lib/org.springframework.beans-3.1.1.RELEASE.jar
lib/aspectjweaver-1.6.8.jar lib/org.springframework.context.support-3.1.1.RELEASE.jar
lib/commons-logging-1.1.1.jar lib/org.springframework.context-3.1.1.RELEASE.jar
lib/junit-4.10.jar lib/org.springframework.core-3.1.1.RELEASE.jar
lib/log4j-1.2.16.jar lib/org.springframework.expression-3.1.1.RELEASE.jar
lib/mysql-connector-java-5.1.18-bin.jar lib/org.springframework.jdbc-3.1.1.RELEASE.jar
lib/org.springframework.aop-3.1.1.RELEASE.jar lib/org.springframework.transaction-3.1.1.RELEASE.jar
lib/org.springframework.asm-3.1.1.RELEASE.jar lib/spring-data-jdbc-core-1.0.0.RC1.jar"/>
</manifest>
</jar>
</target>
<!-- Generates Javadoc for the sources under ${src} into ${docs} -->
<!-- NOTE(review): packagenames="src" is the source directory name, while the Main-Class above lives in package net.woodyfolsom.cs6601.p3; confirm this attribute is what was intended -->
<target name="docs" depends="compile">
<javadoc packagenames="src" sourcepath="${src}" destdir="${docs}">
<!-- Define which files / directory should get included, we include all -->
<fileset dir="${src}">
<include name="**" />
</fileset>
</javadoc>
</target>
<!-- Common setup that every other build step ultimately depends on -->
<target name="init">
<!-- Create the time stamp -->
<tstamp />
<!-- Create the build directory structure used by compile -->
<mkdir dir="${build}" />
</target>
<!-- Runs JUnit over every *Test.class found in ${build}; results are printed to the console (usefile="false") and the build stops on the first failing test (haltonfailure="true") -->
<target name="test" depends="compile-test">
<junit haltonfailure="true">
<classpath refid="classpath.test" />
<formatter type="brief" usefile="false" />
<batchtest>
<fileset dir="${build}" includes="**/*Test.class" />
</batchtest>
</junit>
</target>
</project>

Binary file not shown.

Binary file not shown.

BIN
dist/CS6601_P3.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/aopalliance.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/aspectj-1.6.12.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/aspectjweaver-1.6.8.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/commons-logging-1.1.1.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/junit-4.10.jar vendored Normal file

Binary file not shown.

BIN
dist/lib/log4j-1.2.16.jar vendored Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

50
dist/stock_symbols.csv vendored Normal file
View File

@@ -0,0 +1,50 @@
1,Wal-Mart Stores,WMT
2,Exxon Mobil,XOM
3,Chevron,CVX
4,ConocoPhillips,COP
5,Fannie Mae,FNMA
6,General Electric,GE
7,Berkshire Hathaway,BRKA
8,General Motors,GM
9,Bank of America Corp.,BAC
10,Ford Motor,F
11,Hewlett-Packard,HPQ
12,AT&T,T
13,J.P. Morgan Chase & Co.,JPM
14,Citigroup,C
15,McKesson,MCK
16,Verizon Communications,VZ
17,American International Group,AIG
18,International Business Machines,IBM
19,Cardinal Health,CAH
20,Freddie Mac,FMCC
21,CVS Caremark,CVS
22,UnitedHealth Group,UNH
23,Wells Fargo,WFC
24,Valero Energy,VLO
25,Kroger,KR
26,Procter & Gamble,PG
27,AmerisourceBergen,ABC
28,Costco Wholesale,COST
29,Marathon Oil,MRO
30,Home Depot,HD
31,Pfizer,PFE
32,Walgreen,WAG
33,Target,TGT
34,Medco Health Solutions,MHS
35,Apple,AAPL
36,Boeing,BA
37,State Farm Insurance Cos.,SNPAX
38,Microsoft,MSFT
39,Archer Daniels Midland,ADM
40,Johnson & Johnson,JNJ
41,Dell,DELL
42,WellPoint,WLP
43,PepsiCo,PEP
44,United Technologies,UTX
45,Dow Chemical,DOW
46,MetLife,MET
47,Best Buy,BBY
48,United Parcel Service,UPS
49,Kraft Foods,KFT
50,Lowe's,LOW
1 1 Wal-Mart Stores WMT
2 2 Exxon Mobil XOM
3 3 Chevron CVX
4 4 ConocoPhillips COP
5 5 Fannie Mae FNMA
6 6 General Electric GE
7 7 Berkshire Hathaway BRKA
8 8 General Motors GM
9 9 Bank of America Corp. BAC
10 10 Ford Motor F
11 11 Hewlett-Packard HPQ
12 12 AT&T T
13 13 J.P. Morgan Chase & Co. JPM
14 14 Citigroup C
15 15 McKesson MCK
16 16 Verizon Communications VZ
17 17 American International Group AIG
18 18 International Business Machines IBM
19 19 Cardinal Health CAH
20 20 Freddie Mac FMCC
21 21 CVS Caremark CVS
22 22 UnitedHealth Group UNH
23 23 Wells Fargo WFC
24 24 Valero Energy VLO
25 25 Kroger KR
26 26 Procter & Gamble PG
27 27 AmerisourceBergen ABC
28 28 Costco Wholesale COST
29 29 Marathon Oil MRO
30 30 Home Depot HD
31 31 Pfizer PFE
32 32 Walgreen WAG
33 33 Target TGT
34 34 Medco Health Solutions MHS
35 35 Apple AAPL
36 36 Boeing BA
37 37 State Farm Insurance Cos. SNPAX
38 38 Microsoft MSFT
39 39 Archer Daniels Midland ADM
40 40 Johnson & Johnson JNJ
41 41 Dell DELL
42 42 WellPoint WLP
43 43 PepsiCo PEP
44 44 United Technologies UTX
45 45 Dow Chemical DOW
46 46 MetLife MET
47 47 Best Buy BBY
48 48 United Parcel Service UPS
49 49 Kraft Foods KFT
50 50 Lowe's LOW

28
res/AppContext.xml Normal file
View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring application context for HeadlinePuller: declares the JDBC DataSource and the two headline service beans, and turns on annotation-driven wiring plus component scanning for the application package. -->
<!-- NOTE(review): the schema locations below reference the 2.5 XSDs while the build bundles Spring 3.1.1 jars; confirm this is intentional -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:aop="http://www.springframework.org/schema/aop"
xmlns:context="http://www.springframework.org/schema/context"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://www.springframework.org/schema/aop
http://www.springframework.org/schema/aop/spring-aop-2.5.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-2.5.xsd"
default-autowire="byName">
<!-- DataSource for the cs6601p3 MySQL database, using the MySQL JDBC driver. -->
<!-- NOTE(review): the database host, user name and password are hard-coded in plain text in this committed file; consider moving them to an external, uncommitted properties source -->
<bean id="dmdataSource"
class="org.springframework.jdbc.datasource.DriverManagerDataSource">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url" value="jdbc:mysql://woodyfolsom.net:3306/cs6601p3" />
<property name="username" value="cs6601" />
<property name="password" value="n0nst@p" />
</bean>
<!-- The two HeadlineService implementations: one backed by MySQL, one backed by Yahoo -->
<bean id="mySQLHeadlineSvc" class="net.woodyfolsom.cs6601.p3.svc.MySQLHeadlineServiceImpl" />
<bean id="yahooHeadlineSvc" class="net.woodyfolsom.cs6601.p3.svc.YahooHeadlineServiceImpl" />
<!-- Enable annotation processing (the application uses @Autowired) and scan net.woodyfolsom.cs6601.p3 for annotated components such as the @Component HeadlinePuller -->
<context:annotation-config />
<context:component-scan base-package="net.woodyfolsom.cs6601.p3"/>
</beans>

View File

@@ -6,47 +6,119 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.context.support.FileSystemXmlApplicationContext;
import org.springframework.stereotype.Component;
import net.woodyfolsom.cs6601.p3.domain.Company;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
import net.woodyfolsom.cs6601.p3.svc.YahooHeadlineServiceImpl;
@Component
public class HeadlinePuller {
private static final File stockSymbolsCSV = new File("stock_symbols.csv");
private static final int IO_EXCEPTION = 1;
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 2;
private static final int INVALID_END_DATE = 1;
private static final int INVALID_MODE = 2;
private static final int INVALID_START_DATE = 3;
private static final int IO_EXCEPTION = 4;
private static final int NO_ARGS = 5;
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 6;
@Autowired
HeadlineService mySQLHeadlineServiceImpl;
@Autowired
HeadlineService yahooHeadlineServiceImpl;
/**
 * Prints the command-line usage summary to standard output.
 * <p>
 * The jar name shown is the artifact the Ant build actually produces
 * (CS6601_P3.jar); file names are case-sensitive on most platforms, so the
 * hint must match it exactly to be copy-pasteable.
 */
private static void printUsage() {
    System.out
            .println("Usage: java -jar CS6601_P3.jar [insert|delete] mm/dd/yyyy-mm/dd/yyyy");
}
/**
 * Operating mode selected by the first command-line argument.
 * <p>
 * The constant names are matched verbatim against that argument with
 * MODE.valueOf(String), so they must keep their exact lower-case spelling.
 */
private enum MODE {
// insert: pull headlines and store them; invalid: initial value kept when no valid mode was parsed (main then exits with INVALID_MODE); delete: asks for confirmation before purging the HEADLINES table
insert, invalid, delete
}
public static void main(String... args) {
ApplicationContext context=new ClassPathXmlApplicationContext(new String[]{"/AppContext.xml"});
HeadlinePuller headlinePuller = context.getBean(HeadlinePuller.class);
MODE mode = MODE.invalid;
if (args.length != 2) {
printUsage();
System.exit(NO_ARGS);
} else {
try {
List<Company> fortune50 = headlinePuller.getFortune50(stockSymbolsCSV);
for (Company company : fortune50) {
System.out.println("Getting headlines for Fortune 50 company #" + company.getId() + " (" + company.getName() + ")...");
Date today = new Date();
List<Headline> headlines = headlinePuller.pullHeadlines(company.getStockSymbol(), today);
headlinePuller.insertHeadlines(company.getStockSymbol(), today, headlines);
System.out.println("Waiting 10 seconds to accommodate Yahoo throttling...");
mode = MODE.valueOf(args[0]);
} catch (Exception ex) {
System.out.println("Invalid mode: " + args[0]);
}
}
if (mode == MODE.invalid) {
System.exit(INVALID_MODE);
}
if (mode == MODE.delete) {
System.out.println("Mode = delete. All data will be purged from HEADLINES table. Continue? [y/n]");
byte[] buf = new byte[10];
try {
Thread.sleep(10000L);
} catch (InterruptedException ie) {
System.out.println("Interrupted while waiting, exiting");
int read = System.in.read(buf,0,10);
String conf = new String(buf,0,read,Charset.defaultCharset());
System.out.println("CONF = '" + conf +"'");
if (conf.charAt(0) == 'y') {
System.out.println("Delete mode confirmed. Continuing...");
System.exit(0);
} else {
System.out.println("Delete mode cancelled.");
System.exit(0);
}
} catch (IOException ioe) {
System.exit(IO_EXCEPTION);
}
}
// Parse the date-range argument, expected as "<start>-<end>" with each date in
// MM/dd/yyyy form. The dates use '/' internally, so '-' separates the two dates.
String[] dateFields = args[1].split("-");
if (dateFields.length < 2) {
    // Without this guard a missing separator or missing date surfaces as an
    // ArrayIndexOutOfBoundsException instead of a usage message.
    System.out.println("Invalid date range: " + args[1]);
    printUsage();
    System.exit(dateFields.length == 0 ? INVALID_START_DATE : INVALID_END_DATE);
}
DateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
Date startDate = null;
try {
    startDate = dateFormat.parse(dateFields[0]);
} catch (ParseException pe) {
    System.out.println("Invalid start date: " + dateFields[0]);
    System.exit(INVALID_START_DATE);
}
Date endDate = null;
try {
    endDate = dateFormat.parse(dateFields[1]);
} catch (ParseException pe) {
    // Echo the end-date field itself; the previous message printed the start date.
    System.out.println("Invalid end date: " + dateFields[1]);
    System.exit(INVALID_END_DATE);
}
ApplicationContext context = new FileSystemXmlApplicationContext(
new String[] { "AppContext.xml" });
HeadlinePuller headlinePuller = context.getBean(HeadlinePuller.class);
Calendar calendar = Calendar.getInstance();
try {
List<Company> fortune50 = headlinePuller
.getFortune50(stockSymbolsCSV);
for (Company company : fortune50) {
System.out.println("Getting headlines for Fortune 50 company #"
+ company.getId() + " (" + company.getName() + ")...");
Date today;
for (calendar.setTime(startDate); (today = calendar.getTime())
.compareTo(endDate) <= 0; calendar
.add(Calendar.DATE, 1)) {
List<Headline> headlines = headlinePuller.pullHeadlines(
company.getStockSymbol(), today);
int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines);
System.out.println(updates.length + " rows updated");
}
}
} catch (FileNotFoundException fnfe) {
@@ -60,23 +132,15 @@ public class HeadlinePuller {
}
}
private void insertHeadlines(String stockSymbol, Date date, List<Headline> headlines) {
for (Headline headline : headlines) {
mySQLHeadlineServiceImpl.insertHeadline(headline);
}
}
private List<Headline> pullHeadlines(String stockSymbol, Date date) {
List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(stockSymbol, date);
for (Headline headline : headlines) {
System.out.println("Got headline: " + headline);
}
List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(
stockSymbol, date);
System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date);
return headlines;
}
private List<Company> getFortune50(File csvFile) throws FileNotFoundException,
IOException {
private List<Company> getFortune50(File csvFile)
throws FileNotFoundException, IOException {
List<Company> fortune50 = new ArrayList<Company>();
FileInputStream fis = new FileInputStream(csvFile);
InputStreamReader reader = new InputStreamReader(fis);
@@ -88,10 +152,12 @@ public class HeadlinePuller {
}
String[] fields = csvline.split(",");
if (fields.length != 3) {
throw new RuntimeException("Badly formatted csv file name (3 values expected): " + csvline);
throw new RuntimeException(
"Badly formatted csv file name (3 values expected): "
+ csvline);
}
int id = Integer.valueOf(fields[0]);
fortune50.add(new Company(id,fields[1],fields[2]));
fortune50.add(new Company(id, fields[1], fields[2]));
}
return fortune50;
}

View File

@@ -8,7 +8,8 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
public interface HeadlineDao {
int deleteById(int id);
int insert(Headline player);
int insert(Headline headline);
int[] insertBatch(List<Headline> headlines);
Headline select(int id);
List<Headline> select(String stock, Date date);

View File

@@ -1,5 +1,6 @@
package net.woodyfolsom.cs6601.p3.dao;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;
@@ -8,6 +9,7 @@ import java.util.List;
import javax.sql.DataSource;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.BatchPreparedStatementSetter;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.simple.ParameterizedRowMapper;
import org.springframework.stereotype.Repository;
@@ -34,6 +36,26 @@ public class HeadlineDaoImpl implements HeadlineDao {
return jdbcTemplate.update(INSERT_STMT, headline.getText(), headline.getDate(), headline.getStock(), headline.getDataset());
}
/**
 * Inserts every headline in the list with one JDBC batch of INSERT_STMT.
 *
 * @param headlines the headlines to insert; one batch entry is bound per element
 * @return the array returned by JdbcTemplate.batchUpdate for the batch
 */
public int[] insertBatch(final List<Headline> headlines){
    final BatchPreparedStatementSetter binder = new BatchPreparedStatementSetter() {
        @Override
        public int getBatchSize() {
            // One statement execution per headline.
            return headlines.size();
        }

        @Override
        public void setValues(PreparedStatement stmt, int index) throws SQLException {
            final Headline current = headlines.get(index);
            stmt.setString(1, current.getText());
            // The domain object carries a java.util.Date; JDBC needs a java.sql.Date.
            final java.sql.Date storyDate = new java.sql.Date(current.getDate().getTime());
            stmt.setDate(2, storyDate);
            stmt.setString(3, current.getStock());
            stmt.setInt(4, current.getDataset());
        }
    };
    return jdbcTemplate.batchUpdate(INSERT_STMT, binder);
}
public Headline select(int headlineId) {
return jdbcTemplate.queryForObject(SELECT_BY_ID_QRY,
new RequestMapper(), headlineId);
@@ -54,6 +76,10 @@ public class HeadlineDaoImpl implements HeadlineDao {
/**
 * Builds a Headline from the given result set by copying the "text",
 * "stock", "date" and "dataset" columns into the matching properties.
 *
 * @param rs   the result set to read the column values from
 * @param arg1 second mapper argument; not used by this implementation
 * @return the populated Headline
 * @throws SQLException if reading a column fails
 */
@Override
public Headline mapRow(ResultSet rs, int arg1) throws SQLException {
    final Headline mapped = new Headline();

    final String text = rs.getString("text");
    mapped.setText(text);

    final String stock = rs.getString("stock");
    mapped.setStock(stock);

    final java.sql.Date storyDate = rs.getDate("date");
    mapped.setDate(storyDate);

    final int dataset = rs.getInt("dataset");
    mapped.setDataset(dataset);

    return mapped;
}

View File

@@ -7,5 +7,6 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
/**
 * Service abstraction for storing and retrieving headlines.
 * <p>
 * Not every implementation supports every operation: the Yahoo-backed
 * implementation in this code base rejects both insert methods with an
 * UnsupportedOperationException.
 */
public interface HeadlineService {
/**
 * Stores a single headline.
 *
 * @param headline the headline to store
 * @return an implementation-defined count (presumably the number of rows affected; TODO confirm)
 */
int insertHeadline(Headline headline);
/**
 * Stores several headlines in one call.
 *
 * @param headline the headlines to store (a list, despite the singular parameter name)
 * @return an implementation-defined array (presumably one update count per headline; TODO confirm)
 */
int[] insertHeadlines(List<Headline> headline);
/**
 * Retrieves the headlines for a stock and a date.
 *
 * @param stock the stock symbol to look up
 * @param date  the date to look up
 * @return the matching headlines
 */
List<Headline> getHeadlines(String stock, Date date);
}

View File

@@ -23,6 +23,11 @@ public class MySQLHeadlineServiceImpl implements HeadlineService {
return headlineDao.insert(headline);
}
/**
 * Stores the given headlines by delegating to the DAO's batch insert.
 *
 * @param headlines the headlines to store
 * @return the array returned by HeadlineDao.insertBatch
 */
@Override
public int[] insertHeadlines(List<Headline> headlines) {
    final int[] batchResult = headlineDao.insertBatch(headlines);
    return batchResult;
}
@Override
public List<Headline> getHeadlines(String stock, Date date) {
return headlineDao.select(stock, date);

View File

@@ -31,13 +31,18 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
private static final String STORY_DATE_FIELD = "STORY_DATE";
private static final String STOCK_SYMBOL_FIELD = "STOCK_SYMBOL";
private static final String QUERY_URL = "http://query.yahooapis.com/v1/public/yql?q=select%20content%20from%20html%20where%20url%3D%22http%3A%2F%2Ffinance.yahoo.com%2Fq%2Fh%3Fs%3DSTOCK_SYMBOL%26t%3DSTORY_DATE%22%20and%20xpath%3D'%2F%2Fdiv%5B%40class%3D%22mod%20yfi_quote_headline%20withsky%22%5D%2Ful%2Fli%2Fa'&diagnostics=true";
private static final String QUERY_URL = "http://query.yahooapis.com/v1/public/yql?q=select%20content%20from%20html%20where%20url%3D%22http%3A%2F%2Ffinance.yahoo.com%2Fq%2Fh%3Fs%3DSTOCK_SYMBOL%26t%3DSTORY_DATE%22%20and%20xpath%3D'%2F%2Fdiv%5B%40class%3D%22mod%20yfi_quote_headline%20withsky%22%5D%2Ful%2Fli%2Fa'";
/**
 * Not supported: this implementation never stores headlines.
 *
 * @param headline ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public int insertHeadline(Headline headline) {
    final String reason = "This implementation does not support inserting headlines.";
    throw new UnsupportedOperationException(reason);
}
/**
 * Not supported: this implementation never stores headlines.
 *
 * @param headline ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public int[] insertHeadlines(List<Headline> headline) {
    final String reason = "This implementation does not support inserting headlines.";
    throw new UnsupportedOperationException(reason);
}
@Override
public List<Headline> getHeadlines(String stock, Date date) {
List<Headline> headlineList = new ArrayList<Headline>();
@@ -57,7 +62,6 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
while ((line = buf.readLine()) != null) {
sb.append(line);
//System.out.println(line);
}
buf.close();
@@ -67,7 +71,6 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
Pattern pattern = Pattern.compile("<a>.*?</a>");
Matcher matcher = pattern.matcher(xmlResults);
while (matcher.find()) {
System.out.println();
String anchorValue = xmlResults.substring(matcher.start()+3,matcher.end()-4);
headlineList.add(new Headline(stock,anchorValue,date,1));
}