diff --git a/pandas/io/data.py b/pandas/io/data.py index e4457d141e92c..2c08838596196 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -3,6 +3,7 @@ """ +import warnings import numpy as np import datetime as dt @@ -13,7 +14,7 @@ from zipfile import ZipFile from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str -from pandas import DataFrame, read_csv, concat +from pandas import Panel, DataFrame, Series, read_csv, concat from pandas.io.parsers import TextParser @@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None, start, end = _sanitize_dates(start, end) if(data_source == "yahoo"): - return get_data_yahoo(name=name, start=start, end=end, + return get_data_yahoo(symbols=name, start=start, end=end, + adjust_price=False, chunk=25, retry_count=retry_count, pause=pause) elif(data_source == "fred"): return get_data_fred(name=name, start=start, end=end) @@ -73,14 +75,27 @@ def _sanitize_dates(start, end): return start, end +def _in_chunks(seq, size): + """ + Return sequence in 'chunks' of size defined by size + """ + return (seq[pos:pos + size] for pos in xrange(0, len(seq), size)) + + def get_quote_yahoo(symbols): """ Get current yahoo quote Returns a DataFrame """ - if not isinstance(symbols, list): - raise TypeError("symbols must be a list") + if isinstance(symbols, str): + sym_list = symbols + elif not isinstance(symbols, Series): + symbols = Series(symbols) + sym_list = str.join('+', symbols) + else: + sym_list = str.join('+', symbols) + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', 'time': 't1', 'short_ratio': 's7'} @@ -90,7 +105,7 @@ def get_quote_yahoo(symbols): data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( - str.join('+', symbols), request) + sym_list, request) try: lines = urllib2.urlopen(urlStr).readlines() @@ -117,22 +132,23 @@ def get_quote_yahoo(symbols): 
return DataFrame(data, index=idx) -def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): +def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, + pause=0): """ Get historical data for the given name from yahoo. Date format is datetime Returns a DataFrame. """ - start, end = _sanitize_dates(start, end) - - if(name is None): - print "Need to provide a name" + if(sym is None): + warnings.warn("Need to provide a name.") return None + start, end = _sanitize_dates(start, end) + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' - url = yahoo_URL + 's=%s' % name + \ + url = yahoo_URL + 's=%s' % sym + \ '&a=%s' % (start.month - 1) + \ '&b=%s' % start.day + \ '&c=%s' % start.year + \ @@ -162,6 +178,164 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): "return a 200 for url %s" % (pause, url)) +def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): + """ + Return modified DataFrame or Panel with adjusted prices based on + 'Adj Close' price. Adds 'Adj_Ratio' column. + """ + adj_ratio = hist_data['Adj Close'] / hist_data['Close'] + + data = hist_data.copy() + for item in price_list: + data[item] = hist_data[item] * adj_ratio + data['Adj_Ratio'] = adj_ratio + del data['Adj Close'] + return data + + +def _calc_return_index(price_df): + """ + Return a returns index from an input price df or series. + """ + + ret_index = price_df.pct_change().add(1).cumprod() + ret_index.ix[0] = 1 + return ret_index + + +def get_components_yahoo(idx_sym): + """ + Returns DataFrame containing list of component information for + index represented in idx_sym from yahoo. Includes component symbol + (ticker), exchange, and name. 
+ + Parameters + ---------- + idx_sym : str + Stock index symbol + Examples: + '^DJI' (Dow Jones Industrial Average) + '^NYA' (NYSE Composite) + '^IXIC' (NASDAQ Composite) + + See: http://finance.yahoo.com/indices for other index symbols + + Returns + ------- + idx_df : DataFrame + """ + stats = 'snx' + #URL of form: + #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv + url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \ + '&e=.csv&h={2}' + + idx_mod = idx_sym.replace('^', '@%5E') + urlStr = url.format(idx_mod, stats, 1) + + idx_df = DataFrame() + mask = [True] + comp_idx = 1 + + #LOOP across component index structure, + #break when no new components are found + while (True in mask): + urlStr = url.format(idx_mod, stats, comp_idx) + lines = (urllib.urlopen(urlStr).read().strip(). + strip('"').split('"\r\n"')) + + lines = [line.strip().split('","') for line in lines] + + temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) + temp_df = temp_df.drop_duplicates() + temp_df = temp_df.set_index('ticker') + mask = ~temp_df.index.isin(idx_df.index) + + comp_idx = comp_idx + 50 + idx_df = idx_df.append(temp_df[mask]) + + return idx_df + + +def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0, + adjust_price=False, ret_index=False, chunksize=25, **kwargs): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, list-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), list-like object of symbols or + DataFrame with index containing stock symbols. + start : string, (defaults to '1/1/2010') + Starting date, timestamp. 
Parses many different kinds of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. + adjust_price : bool, default False + If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') + based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops + 'Adj Close'. + ret_index : bool, default False + If True, includes a simple return index 'Ret_Index' in hist_data. + chunksize : int, default 25 + Number of symbols to download consecutively before initiating pause. + + Returns + ------- + hist_data : DataFrame (str) or Panel (list-like object, DataFrame) + """ + + def dl_mult_symbols(symbols): + stocks = {} + for sym_group in _in_chunks(symbols, chunksize): + for sym in sym_group: + try: + stocks[sym] = _get_hist_yahoo(sym, start=start, + end=end, **kwargs) + except: + warnings.warn('Error with sym: ' + sym + '... 
skipping.') + + time.sleep(pause) + + return Panel(stocks).swapaxes('items', 'minor') + + if 'name' in kwargs: + warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.", + FutureWarning) + symbols = kwargs['name'] + + #If a single symbol, (e.g., 'GOOG') + if isinstance(symbols, (str, int)): + sym = symbols + hist_data = _get_hist_yahoo(sym, start=start, end=end) + #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) + elif isinstance(symbols, DataFrame): + try: + hist_data = dl_mult_symbols(Series(symbols.index)) + except ValueError: + raise + else: #Guess a Series + try: + hist_data = dl_mult_symbols(symbols) + except TypeError: + hist_data = dl_mult_symbols(Series(symbols)) + + if(ret_index): + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if(adjust_price): + hist_data = _adjust_prices(hist_data) + + return hist_data + + def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): """ diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 89c650316468c..1f25e3ccd165f 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -1,14 +1,16 @@ -from pandas.util.py3compat import StringIO, BytesIO -from datetime import datetime -import csv -import os -import sys -import re import unittest -import pandas.io.data as pd import nose -from pandas.util.testing import network +from datetime import datetime + +from pandas.util.py3compat import StringIO, BytesIO + +import pandas as pd +import pandas.io.data as web +from pandas.util.testing import (network, assert_frame_equal, + assert_series_equal, + assert_almost_equal) from numpy.testing.decorators import slow + import urllib2 @@ -21,16 +23,16 @@ def test_yahoo(self): # an excecption when DataReader can't get a 200 response from # yahoo start = datetime(2010, 1, 1) - end = datetime(2012, 1, 24) + end = datetime(2013, 01, 27) try: self.assertEquals( - pd.DataReader("F", 'yahoo', start, end)['Close'][-1], - 12.82) 
+ web.DataReader("F", 'yahoo', start, end)['Close'][-1], + 13.68) self.assertRaises( Exception, - lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo', + lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo', start, end)) except urllib2.URLError: try: @@ -40,7 +42,71 @@ def test_yahoo(self): else: raise + + @slow + @network + def test_get_quote(self): + df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) + assert_series_equal(df.ix[0], df.ix[2]) + + + @slow + @network + def test_get_components(self): + + df = web.get_components_yahoo('^DJI') #Dow Jones + assert isinstance(df, pd.DataFrame) + assert len(df) == 30 + + df = web.get_components_yahoo('^GDAXI') #DAX + assert isinstance(df, pd.DataFrame) + assert len(df) == 30 + assert df[df.name.str.contains('adidas', case=False)].index == 'ADS.DE' + + df = web.get_components_yahoo('^NDX') #NASDAQ-100 + assert isinstance(df, pd.DataFrame) + #assert len(df) == 100 + #Usual culprits, should be around for a while + assert 'AAPL' in df.index + assert 'GOOG' in df.index + assert 'AMZN' in df.index + + @slow + @network + def test_get_data(self): + #single symbol + #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d + df = web.get_data_yahoo('GOOG') + assert df.Volume.ix['OCT-08-2010'] == 2859200 + + sl = ['AAPL', 'AMZN', 'GOOG'] + pan = web.get_data_yahoo(sl, '2012') + ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + assert ts[0].dayofyear == 96 + + dfi = web.get_components_yahoo('^DJI') + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12') + expected = [19.02, 28.23, 25.39] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12', + adjust_price=True) + expected = [18.38, 27.45, 24.54] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + pan = web.get_data_yahoo(dfi, '2011', ret_index=True) + d = [[ 1.01757469, 1.01130524, 
1.02414183], + [ 1.00292912, 1.00770812, 1.01735194], + [ 1.00820152, 1.00462487, 1.01320257], + [ 1.08025776, 0.99845838, 1.00113165]] + + expected = pd.DataFrame(d) + result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']] + assert_almost_equal(result.values, expected.values) + + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)