HeadlinePuller can import all news articles for the Fortune 50 companies within a date range from Yahoo Finance historical data, inserting the headline text into the headlines table of database cs6601p3 on woodyfolsom.net:3306.

Limited to 25 headlines per day (with many repetitions) per Yahoo Finance REST functionality.
This commit is contained in:
Woody Folsom
2012-04-15 10:18:02 -04:00
parent d700d97124
commit 027adff2dd
40 changed files with 322 additions and 39 deletions

View File

@@ -6,47 +6,119 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.context.support.FileSystemXmlApplicationContext;
import org.springframework.stereotype.Component;
import net.woodyfolsom.cs6601.p3.domain.Company;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
import net.woodyfolsom.cs6601.p3.svc.YahooHeadlineServiceImpl;
@Component
public class HeadlinePuller {
private static final File stockSymbolsCSV = new File("stock_symbols.csv");
private static final int IO_EXCEPTION = 1;
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 2;
private static final int INVALID_END_DATE = 1;
private static final int INVALID_MODE = 2;
private static final int INVALID_START_DATE = 3;
private static final int IO_EXCEPTION = 4;
private static final int NO_ARGS = 5;
private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 6;
@Autowired
HeadlineService mySQLHeadlineServiceImpl;
@Autowired
HeadlineService yahooHeadlineServiceImpl;
/**
 * Writes the one-line command-line usage summary to standard output.
 */
private static void printUsage() {
    final String usage = "Usage: java -jar cs6601p3.jar [insert|delete] mm/dd/yyyy-mm/dd/yyyy";
    System.out.println(usage);
}
/**
 * Operating modes selected by the first command-line argument. main parses the
 * argument with MODE.valueOf, so the constant names are the exact spellings
 * accepted on the command line.
 */
private enum MODE {
// invalid is the value main starts with; if it is still set after parsing, main exits with INVALID_MODE.
insert, invalid, delete
}
public static void main(String... args) {
ApplicationContext context=new ClassPathXmlApplicationContext(new String[]{"/AppContext.xml"});
HeadlinePuller headlinePuller = context.getBean(HeadlinePuller.class);
try {
List<Company> fortune50 = headlinePuller.getFortune50(stockSymbolsCSV);
for (Company company : fortune50) {
System.out.println("Getting headlines for Fortune 50 company #" + company.getId() + " (" + company.getName() + ")...");
Date today = new Date();
List<Headline> headlines = headlinePuller.pullHeadlines(company.getStockSymbol(), today);
headlinePuller.insertHeadlines(company.getStockSymbol(), today, headlines);
System.out.println("Waiting 10 seconds to accommodate Yahoo throttling...");
try {
Thread.sleep(10000L);
} catch (InterruptedException ie) {
System.out.println("Interrupted while waiting, exiting");
MODE mode = MODE.invalid;
if (args.length != 2) {
printUsage();
System.exit(NO_ARGS);
} else {
try {
mode = MODE.valueOf(args[0]);
} catch (Exception ex) {
System.out.println("Invalid mode: " + args[0]);
}
}
if (mode == MODE.invalid) {
System.exit(INVALID_MODE);
}
if (mode == MODE.delete) {
System.out.println("Mode = delete. All data will be purged from HEADLINES table. Continue? [y/n]");
byte[] buf = new byte[10];
try {
int read = System.in.read(buf,0,10);
String conf = new String(buf,0,read,Charset.defaultCharset());
System.out.println("CONF = '" + conf +"'");
if (conf.charAt(0) == 'y') {
System.out.println("Delete mode confirmed. Continuing...");
System.exit(0);
} else {
System.out.println("Delete mode cancelled.");
System.exit(0);
}
} catch (IOException ioe) {
System.exit(IO_EXCEPTION);
}
}
String[] dateFields = args[1].split("-");
DateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
Date startDate = null;
try {
startDate = dateFormat.parse(dateFields[0]);
} catch (ParseException pe) {
System.out.println("Invalid start date: " + dateFields[0]);
System.exit(INVALID_START_DATE);
}
Date endDate = null;
try {
endDate = dateFormat.parse(dateFields[1]);
} catch (ParseException pe) {
System.out.println("Invalid end date: " + dateFields[0]);
System.exit(INVALID_END_DATE);
}
ApplicationContext context = new FileSystemXmlApplicationContext(
new String[] { "AppContext.xml" });
HeadlinePuller headlinePuller = context.getBean(HeadlinePuller.class);
Calendar calendar = Calendar.getInstance();
try {
List<Company> fortune50 = headlinePuller
.getFortune50(stockSymbolsCSV);
for (Company company : fortune50) {
System.out.println("Getting headlines for Fortune 50 company #"
+ company.getId() + " (" + company.getName() + ")...");
Date today;
for (calendar.setTime(startDate); (today = calendar.getTime())
.compareTo(endDate) <= 0; calendar
.add(Calendar.DATE, 1)) {
List<Headline> headlines = headlinePuller.pullHeadlines(
company.getStockSymbol(), today);
int[] updates = headlinePuller.mySQLHeadlineServiceImpl.insertHeadlines(headlines);
System.out.println(updates.length + " rows updated");
}
}
} catch (FileNotFoundException fnfe) {
@@ -60,23 +132,15 @@ public class HeadlinePuller {
}
}
/**
 * Stores the given headlines one at a time through mySQLHeadlineServiceImpl.
 *
 * @param stockSymbol accepted for symmetry with pullHeadlines; not used here
 * @param date accepted for symmetry with pullHeadlines; not used here
 * @param headlines headlines to insert, each passed to insertHeadline in list order
 */
private void insertHeadlines(String stockSymbol, Date date, List<Headline> headlines) {
    for (final Headline current : headlines) {
        final HeadlineService sink = mySQLHeadlineServiceImpl;
        sink.insertHeadline(current);
    }
}
/**
 * Fetches the headlines for one stock symbol on one date via
 * yahooHeadlineServiceImpl and prints a one-line summary of how many were
 * returned.
 *
 * @param stockSymbol symbol passed through to the headline service
 * @param date day passed through to the headline service
 * @return the list returned by yahooHeadlineServiceImpl.getHeadlines
 */
private List<Headline> pullHeadlines(String stockSymbol, Date date) {
    // Exactly one lookup per call: the earlier body variant declared and
    // fetched 'headlines' a second time, which neither compiles nor is
    // desirable for a throttled service.
    List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(
            stockSymbol, date);
    System.out.println("Pulled " + headlines.size() + " headlines for " + stockSymbol + " on " + date);
    return headlines;
}
private List<Company> getFortune50(File csvFile) throws FileNotFoundException,
IOException {
private List<Company> getFortune50(File csvFile)
throws FileNotFoundException, IOException {
List<Company> fortune50 = new ArrayList<Company>();
FileInputStream fis = new FileInputStream(csvFile);
InputStreamReader reader = new InputStreamReader(fis);
@@ -88,10 +152,12 @@ public class HeadlinePuller {
}
String[] fields = csvline.split(",");
if (fields.length != 3) {
throw new RuntimeException("Badly formatted csv file name (3 values expected): " + csvline);
throw new RuntimeException(
"Badly formatted csv file name (3 values expected): "
+ csvline);
}
int id = Integer.valueOf(fields[0]);
fortune50.add(new Company(id,fields[1],fields[2]));
fortune50.add(new Company(id, fields[1], fields[2]));
}
return fortune50;
}

View File

@@ -8,7 +8,8 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
public interface HeadlineDao {
int deleteById(int id);
int insert(Headline player);
int insert(Headline headline);
int[] insertBatch(List<Headline> headlines);
Headline select(int id);
List<Headline> select(String stock, Date date);

View File

@@ -1,5 +1,6 @@
package net.woodyfolsom.cs6601.p3.dao;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;
@@ -8,6 +9,7 @@ import java.util.List;
import javax.sql.DataSource;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.BatchPreparedStatementSetter;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.simple.ParameterizedRowMapper;
import org.springframework.stereotype.Repository;
@@ -34,6 +36,26 @@ public class HeadlineDaoImpl implements HeadlineDao {
return jdbcTemplate.update(INSERT_STMT, headline.getText(), headline.getDate(), headline.getStock(), headline.getDataset());
}
/**
 * Inserts all given headlines as one JDBC batch of INSERT_STMT.
 *
 * @param headlines headlines to insert; the batch has one entry per list element
 * @return the array returned by JdbcTemplate.batchUpdate for the batch
 */
public int[] insertBatch(final List<Headline> headlines) {
    final BatchPreparedStatementSetter binder = new BatchPreparedStatementSetter() {
        @Override
        public int getBatchSize() {
            return headlines.size();
        }

        @Override
        public void setValues(PreparedStatement statement, int index) throws SQLException {
            final Headline current = headlines.get(index);
            // Parameter order: text, date, stock, dataset.
            statement.setString(1, current.getText());
            final long storyMillis = current.getDate().getTime();
            statement.setDate(2, new java.sql.Date(storyMillis));
            statement.setString(3, current.getStock());
            statement.setInt(4, current.getDataset());
        }
    };
    return jdbcTemplate.batchUpdate(INSERT_STMT, binder);
}
public Headline select(int headlineId) {
return jdbcTemplate.queryForObject(SELECT_BY_ID_QRY,
new RequestMapper(), headlineId);
@@ -54,6 +76,10 @@ public class HeadlineDaoImpl implements HeadlineDao {
/**
 * Builds a Headline from the current row, reading the "text", "stock",
 * "date" and "dataset" columns. The row-number argument is not used.
 */
@Override
public Headline mapRow(ResultSet rs, int arg1) throws SQLException {
    final Headline mapped = new Headline();

    final String text = rs.getString("text");
    mapped.setText(text);

    final String stock = rs.getString("stock");
    mapped.setStock(stock);

    final java.sql.Date storyDate = rs.getDate("date");
    mapped.setDate(storyDate);

    final int dataset = rs.getInt("dataset");
    mapped.setDataset(dataset);

    return mapped;
}

View File

@@ -7,5 +7,6 @@ import net.woodyfolsom.cs6601.p3.domain.Headline;
/**
 * Operations for storing and retrieving Headline objects.
 * Not every implementation supports every operation: YahooHeadlineServiceImpl
 * throws UnsupportedOperationException from both insert methods.
 */
public interface HeadlineService {
/**
 * Stores a single headline.
 *
 * @param headline the headline to store
 * @return an implementation-defined int; MySQLHeadlineServiceImpl returns the result of HeadlineDao.insert
 */
int insertHeadline(Headline headline);
/**
 * Stores a list of headlines in one call.
 *
 * @param headline the headlines to store (a list, despite the singular name)
 * @return an implementation-defined int array; MySQLHeadlineServiceImpl returns the result of HeadlineDao.insertBatch
 */
int[] insertHeadlines(List<Headline> headline);
/**
 * Looks up headlines for a stock and date.
 *
 * @param stock the stock identifier to look up
 * @param date the date to look up
 * @return the headlines found by the implementation
 */
List<Headline> getHeadlines(String stock, Date date);
}

View File

@@ -23,6 +23,11 @@ public class MySQLHeadlineServiceImpl implements HeadlineService {
return headlineDao.insert(headline);
}
/**
 * Stores the given headlines by delegating to HeadlineDao.insertBatch.
 *
 * @param headlines headlines to store
 * @return the array returned by the DAO's batch insert
 */
@Override
public int[] insertHeadlines(List<Headline> headlines) {
    final int[] batchResult = headlineDao.insertBatch(headlines);
    return batchResult;
}
@Override
public List<Headline> getHeadlines(String stock, Date date) {
return headlineDao.select(stock, date);

View File

@@ -31,13 +31,18 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
private static final String STORY_DATE_FIELD = "STORY_DATE";
private static final String STOCK_SYMBOL_FIELD = "STOCK_SYMBOL";
private static final String QUERY_URL = "http://query.yahooapis.com/v1/public/yql?q=select%20content%20from%20html%20where%20url%3D%22http%3A%2F%2Ffinance.yahoo.com%2Fq%2Fh%3Fs%3DSTOCK_SYMBOL%26t%3DSTORY_DATE%22%20and%20xpath%3D'%2F%2Fdiv%5B%40class%3D%22mod%20yfi_quote_headline%20withsky%22%5D%2Ful%2Fli%2Fa'&diagnostics=true";
private static final String QUERY_URL = "http://query.yahooapis.com/v1/public/yql?q=select%20content%20from%20html%20where%20url%3D%22http%3A%2F%2Ffinance.yahoo.com%2Fq%2Fh%3Fs%3DSTOCK_SYMBOL%26t%3DSTORY_DATE%22%20and%20xpath%3D'%2F%2Fdiv%5B%40class%3D%22mod%20yfi_quote_headline%20withsky%22%5D%2Ful%2Fli%2Fa'";
/**
 * Always rejects the call: this implementation does not store headlines.
 *
 * @throws UnsupportedOperationException on every invocation
 */
@Override
public int insertHeadline(Headline headline) {
    final String reason = "This implementation does not support inserting headlines.";
    throw new UnsupportedOperationException(reason);
}
/**
 * Always rejects the call: this implementation does not store headlines.
 *
 * @throws UnsupportedOperationException on every invocation
 */
@Override
public int[] insertHeadlines(List<Headline> headline) {
    final String reason = "This implementation does not support inserting headlines.";
    throw new UnsupportedOperationException(reason);
}
@Override
public List<Headline> getHeadlines(String stock, Date date) {
List<Headline> headlineList = new ArrayList<Headline>();
@@ -57,7 +62,6 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
while ((line = buf.readLine()) != null) {
sb.append(line);
//System.out.println(line);
}
buf.close();
@@ -67,7 +71,6 @@ public class YahooHeadlineServiceImpl implements HeadlineService {
Pattern pattern = Pattern.compile("<a>.*?</a>");
Matcher matcher = pattern.matcher(xmlResults);
while (matcher.find()) {
System.out.println();
String anchorValue = xmlResults.substring(matcher.start()+3,matcher.end()-4);
headlineList.add(new Headline(stock,anchorValue,date,1));
}