Functional data import from Yahoo Finance news using YQL (Yahoo Query Language) and XPath. Data is stored in the MySQL database cs6601 on woodyfolsom.net.

This commit is contained in:
Woody Folsom
2012-04-07 18:59:39 -04:00
parent a46e790059
commit d700d97124
46 changed files with 610 additions and 482 deletions

View File

@@ -0,0 +1,98 @@
package net.woodyfolsom.cs6601.p3;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import net.woodyfolsom.cs6601.p3.domain.Company;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import net.woodyfolsom.cs6601.p3.svc.HeadlineService;
import net.woodyfolsom.cs6601.p3.svc.YahooHeadlineServiceImpl;
/**
 * Command-line tool that reads a list of companies from {@code stock_symbols.csv},
 * fetches headlines for each stock symbol through {@code yahooHeadlineServiceImpl}
 * and stores them through {@code mySQLHeadlineServiceImpl}.
 */
@Component
public class HeadlinePuller {
    /** Company list; a relative path, so it is resolved against the working directory. */
    private static final File stockSymbolsCSV = new File("stock_symbols.csv");
    /** Process exit code used when reading the CSV file fails. */
    private static final int IO_EXCEPTION = 1;
    /** Process exit code used when the CSV file does not exist. */
    private static final int STOCK_SYMBOL_CSV_NOT_FOUND = 2;
    /** Number of comma-separated values expected on every CSV line: id, name, stock symbol. */
    private static final int EXPECTED_CSV_FIELDS = 3;
    /** Pause between two companies, in milliseconds. */
    private static final long THROTTLE_DELAY_MILLIS = 10000L;

    // NOTE(review): there are two HeadlineService implementations, so the field names
    // presumably double as bean qualifiers -- do not rename without checking AppContext.xml.
    @Autowired
    HeadlineService mySQLHeadlineServiceImpl;

    @Autowired
    HeadlineService yahooHeadlineServiceImpl;

    /**
     * Pulls and stores today's headlines for every company in the CSV file, pausing
     * between companies.
     *
     * <p>Exits with {@code STOCK_SYMBOL_CSV_NOT_FOUND} when the CSV file is missing, with
     * {@code IO_EXCEPTION} when it cannot be read, and with 0 when interrupted while waiting.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        ClassPathXmlApplicationContext context =
                new ClassPathXmlApplicationContext(new String[] { "/AppContext.xml" });
        // Have the context closed when the JVM shuts down, including via System.exit below.
        context.registerShutdownHook();
        HeadlinePuller headlinePuller = context.getBean(HeadlinePuller.class);
        try {
            List<Company> fortune50 = headlinePuller.getFortune50(stockSymbolsCSV);
            for (Company company : fortune50) {
                System.out.println("Getting headlines for Fortune 50 company #" + company.getId()
                        + " (" + company.getName() + ")...");
                Date today = new Date();
                List<Headline> headlines = headlinePuller.pullHeadlines(company.getStockSymbol(), today);
                headlinePuller.insertHeadlines(company.getStockSymbol(), today, headlines);
                System.out.println("Waiting 10 seconds to accommodate Yahoo throttling...");
                try {
                    Thread.sleep(THROTTLE_DELAY_MILLIS);
                } catch (InterruptedException ie) {
                    System.out.println("Interrupted while waiting, exiting");
                    System.exit(0);
                }
            }
        } catch (FileNotFoundException fnfe) {
            System.out.println("Stock symbol CSV file does not exist: "
                    + stockSymbolsCSV);
            System.exit(STOCK_SYMBOL_CSV_NOT_FOUND);
        } catch (IOException ioe) {
            // The file exists but could not be read; report that rather than "does not exist".
            System.out.println("I/O error while reading stock symbol CSV file "
                    + stockSymbolsCSV + ": " + ioe.getMessage());
            System.exit(IO_EXCEPTION);
        }
    }

    /**
     * Stores every headline through {@code mySQLHeadlineServiceImpl}.
     *
     * @param stockSymbol currently unused
     * @param date currently unused
     * @param headlines headlines to insert, one call per element
     */
    private void insertHeadlines(String stockSymbol, Date date, List<Headline> headlines) {
        for (Headline headline : headlines) {
            mySQLHeadlineServiceImpl.insertHeadline(headline);
        }
    }

    /**
     * Fetches headlines through {@code yahooHeadlineServiceImpl} and echoes each one to stdout.
     *
     * @param stockSymbol symbol to query
     * @param date date to query
     * @return the list returned by the service
     */
    private List<Headline> pullHeadlines(String stockSymbol, Date date) {
        List<Headline> headlines = yahooHeadlineServiceImpl.getHeadlines(stockSymbol, date);
        for (Headline headline : headlines) {
            System.out.println("Got headline: " + headline);
        }
        return headlines;
    }

    /**
     * Parses the company list. Every non-blank line must hold exactly three comma-separated
     * values: numeric id, company name, stock symbol. Surrounding whitespace is stripped
     * from each value, because the symbol is later substituted into a URL.
     *
     * @param csvFile file to read
     * @return companies in file order
     * @throws FileNotFoundException if {@code csvFile} cannot be opened
     * @throws IOException if reading fails
     * @throws RuntimeException if a line does not have three values or its id is not an integer
     */
    private List<Company> getFortune50(File csvFile) throws FileNotFoundException,
            IOException {
        List<Company> fortune50 = new ArrayList<Company>();
        BufferedReader buf = new BufferedReader(new InputStreamReader(new FileInputStream(csvFile)));
        try {
            String csvline;
            while ((csvline = buf.readLine()) != null) {
                if (csvline.trim().length() == 0) {
                    continue;
                }
                String[] fields = csvline.split(",");
                if (fields.length != EXPECTED_CSV_FIELDS) {
                    throw new RuntimeException("Badly formatted csv line (3 values expected): " + csvline);
                }
                int id;
                try {
                    id = Integer.parseInt(fields[0].trim());
                } catch (NumberFormatException nfe) {
                    throw new RuntimeException("Badly formatted csv line (numeric id expected): " + csvline, nfe);
                }
                fortune50.add(new Company(id, fields[1].trim(), fields[2].trim()));
            }
        } finally {
            // Release the file handle even when a line is rejected.
            buf.close();
        }
        return fortune50;
    }
}

View File

@@ -0,0 +1,15 @@
package net.woodyfolsom.cs6601.p3.dao;
import java.util.Date;
import java.util.List;
import net.woodyfolsom.cs6601.p3.domain.Headline;
/**
 * Data-access operations for {@link Headline} records.
 */
public interface HeadlineDao {
    /**
     * Deletes the headline with the given id.
     *
     * @param id identifier of the headline to delete
     * @return implementation-defined count; presumably the number of rows removed -- confirm
     *         against the implementation in use
     */
    int deleteById(int id);

    /**
     * Stores a headline.
     *
     * @param headline headline to store
     * @return implementation-defined count; presumably the number of rows inserted -- confirm
     *         against the implementation in use
     */
    int insert(Headline headline);

    /**
     * Looks up a single headline by id.
     *
     * @param id identifier of the headline
     * @return the matching headline
     */
    Headline select(int id);

    /**
     * Looks up the headlines recorded for a stock on a date.
     *
     * @param stock stock symbol to match
     * @param date date to match
     * @return the matching headlines
     */
    List<Headline> select(String stock, Date date);
}

View File

@@ -0,0 +1,61 @@
package net.woodyfolsom.cs6601.p3.dao;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;
import javax.sql.DataSource;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.simple.ParameterizedRowMapper;
import org.springframework.stereotype.Repository;
import net.woodyfolsom.cs6601.p3.domain.Headline;
/**
 * {@link HeadlineDao} implementation that runs its statements against the {@code headlines}
 * table through a Spring {@link JdbcTemplate}.
 */
@Repository
public class HeadlineDaoImpl implements HeadlineDao {
    private static final String DELETE_BY_ID_STMT = "DELETE from headlines WHERE id = ?";
    private static final String INSERT_STMT = "INSERT INTO headlines (text, date, stock, dataset) values (?, ?, ?, ?)";
    private static final String SELECT_BY_ID_QRY = "SELECT * from headlines WHERE id = ?";
    private static final String SELECT_BY_STOCK_QRY = "SELECT * from headlines WHERE stock = ? AND date = ?";

    /** The mapper keeps no state, so one shared instance serves every query. */
    private static final RequestMapper ROW_MAPPER = new RequestMapper();

    private JdbcTemplate jdbcTemplate;

    /**
     * Deletes the row with the given id.
     *
     * @param headlineId id of the row to delete
     * @return the value returned by {@code JdbcTemplate.update}
     */
    @Override
    public int deleteById(int headlineId) {
        // Bind the id only: the statement has a single placeholder, and an update
        // produces no rows for a row mapper to process.
        return jdbcTemplate.update(DELETE_BY_ID_STMT, headlineId);
    }

    /**
     * Inserts the headline's text, date, stock and dataset; the id is not supplied.
     *
     * @param headline headline to insert
     * @return the value returned by {@code JdbcTemplate.update}
     */
    @Override
    public int insert(Headline headline) {
        return jdbcTemplate.update(INSERT_STMT, headline.getText(), headline.getDate(),
                headline.getStock(), headline.getDataset());
    }

    /**
     * Loads one headline by id using {@code JdbcTemplate.queryForObject}.
     *
     * @param headlineId id to look up
     * @return the mapped headline
     */
    @Override
    public Headline select(int headlineId) {
        return jdbcTemplate.queryForObject(SELECT_BY_ID_QRY, ROW_MAPPER, headlineId);
    }

    /**
     * Loads the headlines whose {@code stock} and {@code date} columns equal the arguments.
     *
     * <p>NOTE(review): the date is compared for equality, so whether a value carrying a
     * time of day matches depends on the column type -- confirm against the schema.
     *
     * @param stock stock symbol to match
     * @param date date to match
     * @return the mapped headlines
     */
    @Override
    public List<Headline> select(String stock, Date date) {
        return jdbcTemplate.query(SELECT_BY_STOCK_QRY, ROW_MAPPER, stock, date);
    }

    /**
     * Builds the {@link JdbcTemplate} from the injected data source.
     *
     * @param dataSource data source to run statements against
     */
    @Autowired
    public void createTemplate(DataSource dataSource) {
        this.jdbcTemplate = new JdbcTemplate(dataSource);
    }

    /**
     * Copies one {@code headlines} row into a {@link Headline}.
     *
     * <p>NOTE(review): column names are taken from the SQL statements in this class; the
     * column types are assumed to match the {@link Headline} field types -- confirm against
     * the schema.
     */
    private static final class RequestMapper implements ParameterizedRowMapper<Headline> {
        @Override
        public Headline mapRow(ResultSet rs, int rowNum) throws SQLException {
            Headline headline = new Headline();
            headline.setId(rs.getInt("id"));
            headline.setText(rs.getString("text"));
            headline.setStock(rs.getString("stock"));
            headline.setDataset(rs.getInt("dataset"));
            // Hand out a plain java.util.Date so callers are not exposed to the
            // asymmetric equals() of java.sql.Timestamp.
            java.sql.Timestamp timestamp = rs.getTimestamp("date");
            headline.setDate(timestamp == null ? null : new Date(timestamp.getTime()));
            return headline;
        }
    }
}

View File

@@ -0,0 +1,32 @@
package net.woodyfolsom.cs6601.p3.domain;
/**
 * Mutable value holder for one company: a numeric id, a display name and a stock symbol.
 */
public class Company {
    private int id;
    private String name;
    private String stockSymbol;

    /**
     * Creates a company with all three properties set.
     *
     * @param id numeric identifier
     * @param name company name
     * @param stockSymbol stock symbol
     */
    public Company(int id, String name, String stockSymbol) {
        this.stockSymbol = stockSymbol;
        this.name = name;
        this.id = id;
    }

    // ---- readers ----

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @return the company name */
    public String getName() {
        return this.name;
    }

    /** @return the stock symbol */
    public String getStockSymbol() {
        return this.stockSymbol;
    }

    // ---- writers ----

    /** @param id new numeric identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @param name new company name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param stockSymbol new stock symbol */
    public void setStockSymbol(String stockSymbol) {
        this.stockSymbol = stockSymbol;
    }
}

View File

@@ -0,0 +1,68 @@
package net.woodyfolsom.cs6601.p3.domain;
import java.util.Date;
/**
 * Mutable value holder for one news headline: its text, the stock it belongs to, a date,
 * a dataset number and an id.
 */
public class Headline {
    private String text;
    private String stock;
    private Date date;
    private int id;
    private int dataset;

    /** Creates a headline with every property left at its default value. */
    public Headline() {
    }

    /**
     * Creates a headline with everything except the id set.
     *
     * @param stock stock symbol
     * @param text headline text
     * @param date headline date
     * @param dataset dataset number
     */
    public Headline(String stock, String text, Date date, int dataset) {
        this.dataset = dataset;
        this.date = date;
        this.text = text;
        this.stock = stock;
    }

    /** @return the id */
    public int getId() {
        return this.id;
    }

    /** @param id new id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the stock symbol */
    public String getStock() {
        return this.stock;
    }

    /** @param stock new stock symbol */
    public void setStock(String stock) {
        this.stock = stock;
    }

    /** @return the headline text */
    public String getText() {
        return this.text;
    }

    /** @param text new headline text */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the date; the stored instance itself, not a copy */
    public Date getDate() {
        return this.date;
    }

    /** @param date new date; stored as given, not copied */
    public void setDate(Date date) {
        this.date = date;
    }

    /** @return the dataset number */
    public int getDataset() {
        return this.dataset;
    }

    /** @param dataset new dataset number */
    public void setDataset(int dataset) {
        this.dataset = dataset;
    }

    /** @return the headline text exactly as stored, which may be {@code null} */
    @Override
    public String toString() {
        return this.text;
    }
}

View File

@@ -0,0 +1,11 @@
package net.woodyfolsom.cs6601.p3.svc;
import java.util.Date;
import java.util.List;
import net.woodyfolsom.cs6601.p3.domain.Headline;
/**
 * Source and/or sink of {@link Headline} records for a stock on a date.
 */
public interface HeadlineService {
/**
 * Stores a headline. Implementations may not support this operation:
 * YahooHeadlineServiceImpl throws UnsupportedOperationException.
 *
 * @param headline headline to store
 * @return implementation-defined count; MySQLHeadlineServiceImpl returns its DAO's insert result
 */
int insertHeadline(Headline headline);
/**
 * Returns the headlines available for a stock on a date.
 *
 * @param stock stock symbol
 * @param date date of interest
 * @return the headlines found by the implementation
 */
List<Headline> getHeadlines(String stock, Date date);
}

View File

@@ -0,0 +1,30 @@
package net.woodyfolsom.cs6601.p3.svc;
import java.util.Date;
import java.util.List;
import net.woodyfolsom.cs6601.p3.dao.HeadlineDao;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * {@link HeadlineService} that hands both operations straight to the injected
 * {@link HeadlineDao}.
 */
@Service
public class MySQLHeadlineServiceImpl implements HeadlineService {
    private Log log = LogFactory.getLog(MySQLHeadlineServiceImpl.class);

    @Autowired
    private HeadlineDao headlineDao;

    /**
     * Queries the DAO for the headlines of a stock on a date.
     *
     * @param stock stock symbol passed to the DAO
     * @param date date passed to the DAO
     * @return whatever {@code HeadlineDao.select(stock, date)} returns
     */
    @Override
    public List<Headline> getHeadlines(String stock, Date date) {
        List<Headline> stored = headlineDao.select(stock, date);
        return stored;
    }

    /**
     * Passes the headline to the DAO for insertion.
     *
     * @param headline headline to insert
     * @return whatever {@code HeadlineDao.insert(headline)} returns
     */
    @Override
    public int insertHeadline(Headline headline) {
        int daoResult = headlineDao.insert(headline);
        return daoResult;
    }
}

View File

@@ -0,0 +1,86 @@
package net.woodyfolsom.cs6601.p3.svc;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.woodyfolsom.cs6601.p3.dao.HeadlineDao;
import net.woodyfolsom.cs6601.p3.domain.Headline;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Read-only {@link HeadlineService} that asks the YQL endpoint in {@code QUERY_URL} for the
 * headline anchors of a Yahoo Finance page and turns each {@code <a>} element of the
 * response into a {@link Headline}.
 */
@Service
public class YahooHeadlineServiceImpl implements HeadlineService {
    private static final Log log = LogFactory.getLog(YahooHeadlineServiceImpl.class);

    /** Date layout substituted into the query URL. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";
    /** Placeholder in {@code QUERY_URL} replaced by the formatted date. */
    private static final String STORY_DATE_FIELD = "STORY_DATE";
    /** Placeholder in {@code QUERY_URL} replaced by the stock symbol. */
    private static final String STOCK_SYMBOL_FIELD = "STOCK_SYMBOL";
    private static final String QUERY_URL = "http://query.yahooapis.com/v1/public/yql?q=select%20content%20from%20html%20where%20url%3D%22http%3A%2F%2Ffinance.yahoo.com%2Fq%2Fh%3Fs%3DSTOCK_SYMBOL%26t%3DSTORY_DATE%22%20and%20xpath%3D'%2F%2Fdiv%5B%40class%3D%22mod%20yfi_quote_headline%20withsky%22%5D%2Ful%2Fli%2Fa'&diagnostics=true";
    /** Connect and read timeout for the HTTP request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;
    /** Matches one {@code <a>...</a>} element; group 1 is the text between the tags. */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a>(.*?)</a>");

    /**
     * Not supported by this implementation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public int insertHeadline(Headline headline) {
        throw new UnsupportedOperationException("This implementation does not support inserting headlines.");
    }

    /**
     * Downloads the query result for the given stock and date and extracts one headline
     * per {@code <a>} element. Each headline is created with dataset number 1.
     *
     * <p>The anchor text is taken verbatim from the response; XML entities in it are not
     * decoded.
     *
     * @param stock stock symbol substituted into the query URL
     * @param date date substituted into the query URL, formatted as {@code yyyy-MM-dd}
     * @return the extracted headlines; empty when the URL is malformed or the request fails
     */
    @Override
    public List<Headline> getHeadlines(String stock, Date date) {
        List<Headline> headlineList = new ArrayList<Headline>();
        try {
            String xmlResults = fetch(new URL(populateQueryURL(stock, date)));
            Matcher matcher = ANCHOR_PATTERN.matcher(xmlResults);
            while (matcher.find()) {
                headlineList.add(new Headline(stock, matcher.group(1), date, 1));
            }
        } catch (MalformedURLException mue) {
            log.warn("Caught MalformedURLException: " + mue.getMessage() + ", returning empty Headline list.", mue);
        } catch (IOException ioe) {
            log.warn("Caught IOException: " + ioe.getMessage() + ", returning empty Headline list.", ioe);
        }
        return headlineList;
    }

    /**
     * Performs the GET request and returns the response body with line breaks removed.
     * The reader and the connection are released whether or not the read succeeds.
     */
    private String fetch(URL url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setRequestMethod("GET");
            // Without a connect timeout an unreachable host could block the caller indefinitely.
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.connect();
            // NOTE(review): assumes the endpoint responds in UTF-8 -- confirm. An explicit
            // charset keeps decoding independent of the platform default.
            BufferedReader buf = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"));
            try {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = buf.readLine()) != null) {
                    sb.append(line);
                }
                return sb.toString();
            } finally {
                buf.close();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Fills the stock symbol and the formatted date into {@code QUERY_URL}.
     */
    private String populateQueryURL(String stock, Date date) {
        // SimpleDateFormat is not thread-safe and this service may be shared, so use a fresh one.
        DateFormat formatter = new SimpleDateFormat(DATE_PATTERN);
        // replace() substitutes literally; replaceAll() would interpret '$' and '\' in the symbol.
        return QUERY_URL.replace(STOCK_SYMBOL_FIELD, stock)
                .replace(STORY_DATE_FIELD, formatter.format(date));
    }
}