From 9a48b4527670ff56b8529c01aa961873bbd610d5 Mon Sep 17 00:00:00 2001 From: "Nicholaus E. Halecky" Date: Mon, 28 Jan 2013 16:30:04 -0800 Subject: [PATCH 1/3] ENH: Expand Yahoo finance features, idx components --- pandas/io/data.py | 182 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 9 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index e4457d141e92c..964a58739b7e6 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -3,6 +3,7 @@ """ +import warnings import numpy as np import datetime as dt @@ -13,7 +14,7 @@ from zipfile import ZipFile from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str -from pandas import DataFrame, read_csv, concat +from pandas import Panel, DataFrame, Series, read_csv, concat from pandas.io.parsers import TextParser @@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None, start, end = _sanitize_dates(start, end) if(data_source == "yahoo"): - return get_data_yahoo(name=name, start=start, end=end, + return get_data_yahoo(symbols=name, start=start, end=end, + adjust_price=False, chunk=25, retry_count=retry_count, pause=pause) elif(data_source == "fred"): return get_data_fred(name=name, start=start, end=end) @@ -73,14 +75,27 @@ def _sanitize_dates(start, end): return start, end +def _in_chunks(seq, size): + """ + Return sequence in 'chunks' of size defined by size + """ + return (seq[pos:pos + size] for pos in xrange(0, len(seq), size)) + + def get_quote_yahoo(symbols): """ Get current yahoo quote Returns a DataFrame """ - if not isinstance(symbols, list): - raise TypeError("symbols must be a list") + if isinstance(symbols, str): + sym_list = symbols + elif not isinstance(symbols, Series): + symbols = Series(symbols) + sym_list = str.join('+', symbols) + else: + sym_list = str.join('+', symbols) + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', 'time': 't1', 'short_ratio': 's7'} 
@@ -90,7 +105,7 @@ def get_quote_yahoo(symbols): data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( - str.join('+', symbols), request) + sym_list, request) try: lines = urllib2.urlopen(urlStr).readlines() @@ -117,19 +132,20 @@ def get_quote_yahoo(symbols): return DataFrame(data, index=idx) -def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): +def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3, + pause=0): """ Get historical data for the given name from yahoo. Date format is datetime Returns a DataFrame. """ - start, end = _sanitize_dates(start, end) - if(name is None): - print "Need to provide a name" + warnings.warn("Need to provide a name.") return None + start, end = _sanitize_dates(start, end) + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' url = yahoo_URL + 's=%s' % name + \ @@ -162,6 +178,154 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): "return a 200 for url %s" % (pause, url)) +def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): + """ + Return modified DataFrame or Panel with adjusted prices based on + 'Adj Close' price. Adds 'Adj_Ratio' column. + """ + adj_ratio = hist_data['Adj Close'] / hist_data['Close'] + + data = hist_data.copy() + for item in price_list: + data[item] = hist_data[item] * adj_ratio + data['Adj_Ratio'] = adj_ratio + del data['Adj Close'] + return data + + +def _calc_return_index(price_df): + """ + Return a returns index from an input price df or series. + """ + + ret_index = price_df.pct_change().add(1).cumprod() + ret_index.ix[0] = 1 + return ret_index + + +def get_components_yahoo(idx_sym='^DJI'): + """ + Returns DataFrame containing list of component information for index + represented in idx_sym from yahoo. Includes component symbol + (ticker), exchange, and name. 
+ + Parameters + ---------- + idx_sym : str + Index symbol, default '^DJI' (Dow Jones Industrial Average) + Examples: + '^NYA' (NYSE Composite) + '^IXIC' (NASDAQ Composite) + + See: http://finance.yahoo.com/indices for other index symbols + + Returns + ------- + idx_df : DataFrame + """ + stats = 'snx' + #URL of form: + #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv + url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \ + '&e=.csv&h={2}' + + idx_mod = idx_sym.replace('^', '@%5E') + urlStr = url.format(idx_mod, stats, 1) + + idx_df = DataFrame() + mask = [True] + comp_idx = 1 + + #LOOP across component index structure, + #break when no new components are found + while (True in mask): + urlStr = url.format(idx_mod, stats, comp_idx) + lines = (urllib.urlopen(urlStr).read().strip(). + strip('"').split('"\r\n"')) + + lines = [line.strip().split('","') for line in lines] + + temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) + temp_df = temp_df.drop_duplicates() + temp_df = temp_df.set_index('ticker') + mask = ~temp_df.index.isin(idx_df.index) + + comp_idx = comp_idx + 50 + idx_df = idx_df.append(temp_df[mask]) + + return idx_df + + +def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False, + ret_index=False, chunk=25, pause=0, **kwargs): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, list-like object (list, tupel, Series), DataFrame + Single stock symbol (ticker), list-like object of symbols or + DataFrame with index containing of stock symbols + start : string, (defaults to '1/1/2010') + Starting date, timestamp. 
Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + adjust_price : bool, default False + Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via + 'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'. + ret_index: bool, default False + Include a simple return index 'Ret_Index' in hist_data. + chunk : int, default 25 + Number of symbols to download consecutively before intiating pause. + pause : int, default 0 + Time, in seconds, to pause between consecutive chunks. + **kwargs: additional arguments to pass to _get_hist_yahoo + + Returns + ------- + hist_data : DataFrame (str) or Panel (list-like object, DataFrame) + """ + def dl_mult_symbols(symbols): + stocks = {} + for sym_group in _in_chunks(symbols, chunk): + for sym in sym_group: + try: + stocks[sym] = _get_hist_yahoo(name=sym, start=start, + end=end, **kwargs) + except: + warnings.warn('Error with sym: ' + sym + '... skipping.') + + time.sleep(pause) + + return Panel(stocks).swapaxes('items', 'minor') + + #If a scalar (single symbol, e.g. 'GOOG') + if isinstance(symbols, (str, int)): + sym = symbols + hist_data = _get_hist_yahoo(sym, start=start, end=end, **kwargs) + #Multiple symbols + elif isinstance(symbols, DataFrame): + try: + hist_data = dl_mult_symbols(Series(symbols.index)) + except ValueError: + raise + else: #Guess a Series + try: + hist_data = dl_mult_symbols(symbols) + except TypeError: + hist_data = dl_mult_symbols(Series(symbols)) + + if(ret_index): + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if(adjust_price): + hist_data = _adjust_prices(hist_data) + + return hist_data + + def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): """ From 9b59f92fd4427e263eaaec6c9a245af06d0451d5 Mon Sep 17 00:00:00 2001 From: "Nicholaus E. 
Halecky" Date: Sat, 2 Feb 2013 14:45:29 -0800 Subject: [PATCH 2/3] TST: Expanded test coverage of yahoo finance funcs --- pandas/io/tests/test_yahoo.py | 90 ++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 13 deletions(-) diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 89c650316468c..f705eddff5f56 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -1,14 +1,16 @@ -from pandas.util.py3compat import StringIO, BytesIO -from datetime import datetime -import csv -import os -import sys -import re import unittest -import pandas.io.data as pd import nose -from pandas.util.testing import network +from datetime import datetime + +from pandas.util.py3compat import StringIO, BytesIO + +import pandas as pd +import pandas.io.data as web +from pandas.util.testing import (network, assert_frame_equal, + assert_series_equal, + assert_almost_equal) from numpy.testing.decorators import slow + import urllib2 @@ -21,16 +23,16 @@ def test_yahoo(self): # an excecption when DataReader can't get a 200 response from # yahoo start = datetime(2010, 1, 1) - end = datetime(2012, 1, 24) + end = datetime(2013, 01, 27) try: self.assertEquals( - pd.DataReader("F", 'yahoo', start, end)['Close'][-1], - 12.82) + web.DataReader("F", 'yahoo', start, end)['Close'][-1], + 13.68) self.assertRaises( Exception, - lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo', + lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo', start, end)) except urllib2.URLError: try: @@ -40,7 +42,69 @@ def test_yahoo(self): else: raise + @slow + @network + def test_get_quote(self): + df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) + assert_series_equal(df.ix[0], df.ix[2]) + + @slow + @network + def test_get_components(self): + + df = web.get_components_yahoo() #Dow Jones (default) + assert isinstance(df, pd.DataFrame) + assert len(df) == 30 + + df = web.get_components_yahoo('^GDAXI') #DAX + assert isinstance(df, pd.DataFrame) + 
assert len(df) == 30 + assert df[df.name.str.contains('adidas', case=False)].index == 'ADS.DE' + + df = web.get_components_yahoo('^NDX') #NASDAQ-100 + assert isinstance(df, pd.DataFrame) + assert len(df) == 100 + #Usual culprits, should be around for a while + assert 'AAPL' in df.index + assert 'GOOG' in df.index + assert 'AMZN' in df.index + + @slow + @network + def test_get_data(self): + #single symbol + #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d + df = web.get_data_yahoo('GOOG') + assert df.Volume.ix['OCT-08-2010'] == 2859200 + + sl = ['AAPL', 'AMZN', 'GOOG'] + pan = web.get_data_yahoo(sl, '2012') + ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + assert ts[0].dayofyear == 96 + + dfi = web.get_components_yahoo('^DJI') + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13') + expected = [19.02, 28.23, 25.39] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13', + adjust_price=True) + expected = [18.38, 27.45, 24.54] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + pan = web.get_data_yahoo(dfi, '2011', ret_index=True) + d = [[ 1.31810193, 1.08170606, 1.05281026], + [ 1.31810193, 1.09352518, 1.05658242], + [ 1.30228471, 1.09815005, 1.05054696], + [ 1.30521383, 1.08119219, 1.03545832]] + + expected = pd.DataFrame(d) + result = pan.Ret_Index[['GE', 'INTC', 'MSFT']].ix[-5:-1] + assert_almost_equal(result.values, expected.values) + + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 770810461df6ee6fd02bc49618e228ca5f483e72 Mon Sep 17 00:00:00 2001 From: "Nicholaus E. 
Halecky" Date: Tue, 5 Feb 2013 00:33:30 -0800 Subject: [PATCH 3/3] BUG: Fix backwards compatibility in get_data_yahoo --- pandas/io/data.py | 60 ++++++++++++++++++++--------------- pandas/io/tests/test_yahoo.py | 20 ++++++------ 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index 964a58739b7e6..2c08838596196 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -132,7 +132,7 @@ def get_quote_yahoo(symbols): return DataFrame(data, index=idx) -def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3, +def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, pause=0): """ Get historical data for the given name from yahoo. @@ -140,7 +140,7 @@ def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3, Returns a DataFrame. """ - if(name is None): + if(sym is None): warnings.warn("Need to provide a name.") return None @@ -148,7 +148,7 @@ def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3, yahoo_URL = 'http://ichart.yahoo.com/table.csv?' - url = yahoo_URL + 's=%s' % name + \ + url = yahoo_URL + 's=%s' % sym + \ '&a=%s' % (start.month - 1) + \ '&b=%s' % start.day + \ '&c=%s' % start.year + \ @@ -203,17 +203,18 @@ def _calc_return_index(price_df): return ret_index -def get_components_yahoo(idx_sym='^DJI'): +def get_components_yahoo(idx_sym): """ - Returns DataFrame containing list of component information for index - represented in idx_sym from yahoo. Includes component symbol + Returns DataFrame containing list of component information for + index represented in idx_sym from yahoo. Includes component symbol (ticker), exchange, and name. 
Parameters ---------- idx_sym : str - Index symbol, default '^DJI' (Dow Jones Industrial Average) + Stock index symbol Examples: + '^DJI' (Dow Jones Industrial Average) '^NYA' (NYSE Composite) '^IXIC' (NASDAQ Composite) @@ -256,8 +257,8 @@ def get_components_yahoo(idx_sym='^DJI'): return idx_df -def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False, - ret_index=False, chunk=25, pause=0, **kwargs): +def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, + adjust_price=False, ret_index=False, chunksize=25, **kwargs): """ Returns DataFrame/Panel of historical stock prices from symbols, over date range, start to end. To avoid being penalized by Yahoo! Finance servers, @@ -265,35 +266,39 @@ def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False, Parameters ---------- - symbols : string, list-like object (list, tupel, Series), DataFrame + symbols : string, list-like object (list, tuple, Series), or DataFrame Single stock symbol (ticker), list-like object of symbols or - DataFrame with index containing of stock symbols + DataFrame with index containing stock symbols. start : string, (defaults to '1/1/2010') Starting date, timestamp. Parses many different kind of date representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) + end : string, (defaults to today) Ending date, timestamp. Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. adjust_price : bool, default False - Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via - 'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'. - ret_index: bool, default False - Include a simple return index 'Ret_Index' in hist_data. 
- chunk : int, default 25 + If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') + based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops + 'Adj Close'. + ret_index : bool, default False + If True, includes a simple return index 'Ret_Index' in hist_data. + chunksize : int, default 25 Number of symbols to download consecutively before intiating pause. - pause : int, default 0 - Time, in seconds, to pause between consecutive chunks. - **kwargs: additional arguments to pass to _get_hist_yahoo Returns ------- hist_data : DataFrame (str) or Panel (list-like object, DataFrame) """ + def dl_mult_symbols(symbols): stocks = {} - for sym_group in _in_chunks(symbols, chunk): + for sym_group in _in_chunks(symbols, chunksize): for sym in sym_group: try: - stocks[sym] = _get_hist_yahoo(name=sym, start=start, + stocks[sym] = _get_hist_yahoo(sym, start=start, end=end, **kwargs) except: warnings.warn('Error with sym: ' + sym + '... skipping.') @@ -302,11 +307,16 @@ def dl_mult_symbols(symbols): return Panel(stocks).swapaxes('items', 'minor') - #If a scalar (single symbol, e.g. 
'GOOG') + if 'name' in kwargs: + warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.", + FutureWarning) + symbols = kwargs['name'] + + #If a single symbol, (e.g., 'GOOG') if isinstance(symbols, (str, int)): sym = symbols - hist_data = _get_hist_yahoo(sym, start=start, end=end, **kwargs) - #Multiple symbols + hist_data = _get_hist_yahoo(sym, start=start, end=end) + #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) elif isinstance(symbols, DataFrame): try: hist_data = dl_mult_symbols(Series(symbols.index)) diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index f705eddff5f56..1f25e3ccd165f 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -42,17 +42,19 @@ def test_yahoo(self): else: raise + @slow @network def test_get_quote(self): df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) assert_series_equal(df.ix[0], df.ix[2]) + @slow @network def test_get_components(self): - df = web.get_components_yahoo() #Dow Jones (default) + df = web.get_components_yahoo('^DJI') #Dow Jones assert isinstance(df, pd.DataFrame) assert len(df) == 30 @@ -63,7 +65,7 @@ def test_get_components(self): df = web.get_components_yahoo('^NDX') #NASDAQ-100 assert isinstance(df, pd.DataFrame) - assert len(df) == 100 + #assert len(df) == 100 #Usual culprits, should be around for a while assert 'AAPL' in df.index assert 'GOOG' in df.index @@ -83,25 +85,25 @@ def test_get_data(self): assert ts[0].dayofyear == 96 dfi = web.get_components_yahoo('^DJI') - pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13') + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12') expected = [19.02, 28.23, 25.39] result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() assert result == expected - pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13', + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12', adjust_price=True) expected = [18.38, 27.45, 24.54] result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 
'INTC']].tolist() assert result == expected pan = web.get_data_yahoo(dfi, '2011', ret_index=True) - d = [[ 1.31810193, 1.08170606, 1.05281026], - [ 1.31810193, 1.09352518, 1.05658242], - [ 1.30228471, 1.09815005, 1.05054696], - [ 1.30521383, 1.08119219, 1.03545832]] + d = [[ 1.01757469, 1.01130524, 1.02414183], + [ 1.00292912, 1.00770812, 1.01735194], + [ 1.00820152, 1.00462487, 1.01320257], + [ 1.08025776, 0.99845838, 1.00113165]] expected = pd.DataFrame(d) - result = pan.Ret_Index[['GE', 'INTC', 'MSFT']].ix[-5:-1] + result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']] assert_almost_equal(result.values, expected.values)