From 9a48b4527670ff56b8529c01aa961873bbd610d5 Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Mon, 28 Jan 2013 16:30:04 -0800
Subject: [PATCH 1/3] EHN: Expand Yahoo finance features, idx components

---
 pandas/io/data.py | 182 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 173 insertions(+), 9 deletions(-)

diff --git a/pandas/io/data.py b/pandas/io/data.py
index e4457d141e92c..964a58739b7e6 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -3,6 +3,7 @@
 
 
 """
+import warnings
 
 import numpy as np
 import datetime as dt
@@ -13,7 +14,7 @@
 from zipfile import ZipFile
 from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str
 
-from pandas import DataFrame, read_csv, concat
+from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
 
 
@@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None,
     start, end = _sanitize_dates(start, end)
 
     if(data_source == "yahoo"):
-        return get_data_yahoo(name=name, start=start, end=end,
+        return get_data_yahoo(symbols=name, start=start, end=end,
+                              adjust_price=False, chunk=25,
                               retry_count=retry_count, pause=pause)
     elif(data_source == "fred"):
         return get_data_fred(name=name, start=start, end=end)
@@ -73,14 +75,27 @@ def _sanitize_dates(start, end):
     return start, end
 
 
+def _in_chunks(seq, size):
+    """
+    Return sequence in 'chunks' of size defined by size
+    """
+    return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))
+
+
 def get_quote_yahoo(symbols):
     """
     Get current yahoo quote
 
     Returns a DataFrame
     """
-    if not isinstance(symbols, list):
-        raise TypeError("symbols must be a list")
+    if isinstance(symbols, str):
+        sym_list = symbols
+    elif not isinstance(symbols, Series):
+        symbols  = Series(symbols)
+        sym_list = str.join('+', symbols)
+    else:
+        sym_list = str.join('+', symbols)
+
     # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
     codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r',
              'time': 't1', 'short_ratio': 's7'}
@@ -90,7 +105,7 @@ def get_quote_yahoo(symbols):
     data = dict(zip(codes.keys(), [[] for i in range(len(codes))]))
 
     urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (
-        str.join('+', symbols), request)
+        sym_list, request)
 
     try:
         lines = urllib2.urlopen(urlStr).readlines()
@@ -117,19 +132,20 @@ def get_quote_yahoo(symbols):
     return DataFrame(data, index=idx)
 
 
-def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
+def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3,
+                    pause=0):
     """
     Get historical data for the given name from yahoo.
     Date format is datetime
 
     Returns a DataFrame.
     """
-    start, end = _sanitize_dates(start, end)
-
     if(name is None):
-        print "Need to provide a name"
+        warnings.warn("Need to provide a name.")
         return None
 
+    start, end = _sanitize_dates(start, end)
+
     yahoo_URL = 'http://ichart.yahoo.com/table.csv?'
 
     url = yahoo_URL + 's=%s' % name + \
@@ -162,6 +178,154 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
                     "return a 200 for url %s" % (pause, url))
 
 
+def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
+    """
+    Return modifed DataFrame or Panel with adjusted prices based on
+    'Adj Close' price. Adds 'Adj_Ratio' column.
+    """
+    adj_ratio = hist_data['Adj Close'] / hist_data['Close']
+
+    data = hist_data.copy()
+    for item in price_list:
+        data[item] = hist_data[item] * adj_ratio
+    data['Adj_Ratio'] = adj_ratio
+    del data['Adj Close']
+    return data
+
+
+def _calc_return_index(price_df):
+    """
+    Return a returns index from a input price df or series.
+    """
+
+    ret_index =  price_df.pct_change().add(1).cumprod()
+    ret_index.ix[0] = 1
+    return ret_index
+
+
+def get_components_yahoo(idx_sym='^DJI'):
+    """
+    Returns DataFrame containing list of component information for index
+    represented in idx_sym from yahoo. Includes component symbol
+    (ticker), exchange, and name.
+
+    Parameters
+    ----------
+    idx_sym : str
+        Index symbol, default '^DJI' (Dow Jones Industrial Average)
+        Examples:
+        '^NYA' (NYSE Composite)
+        '^IXIC' (NASDAQ Composite)
+
+        See: http://finance.yahoo.com/indices for other index symbols
+
+    Returns
+    -------
+    idx_df : DataFrame
+    """
+    stats = 'snx'
+    #URL of form:
+    #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
+    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
+          '&e=.csv&h={2}'
+
+    idx_mod = idx_sym.replace('^', '@%5E')
+    urlStr = url.format(idx_mod, stats, 1)
+
+    idx_df = DataFrame()
+    mask = [True]
+    comp_idx = 1
+
+    #LOOP across component index structure,
+    #break when no new components are found
+    while (True in mask):
+        urlStr = url.format(idx_mod, stats,  comp_idx)
+        lines = (urllib.urlopen(urlStr).read().strip().
+                 strip('"').split('"\r\n"'))
+
+        lines = [line.strip().split('","') for line in lines]
+
+        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
+        temp_df = temp_df.drop_duplicates()
+        temp_df = temp_df.set_index('ticker')
+        mask = ~temp_df.index.isin(idx_df.index)
+
+        comp_idx = comp_idx + 50
+        idx_df = idx_df.append(temp_df[mask])
+
+    return idx_df
+
+
+def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False,
+                   ret_index=False, chunk=25, pause=0, **kwargs):
+    """
+    Returns DataFrame/Panel of historical stock prices from symbols, over date
+    range, start to end. To avoid being penalized by Yahoo! Finance servers,
+    pauses between downloading 'chunks' of symbols can be specified.
+
+    Parameters
+    ----------
+    symbols : string, list-like object (list, tupel, Series), DataFrame
+        Single stock symbol (ticker), list-like object of symbols or
+        DataFrame with index containing of stock symbols
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kind of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end :  string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    adjust_price : bool, default False
+        Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via
+        'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'.
+    ret_index: bool, default False
+        Include a simple return index 'Ret_Index' in hist_data.
+    chunk : int, default 25
+        Number of symbols to download consecutively before intiating pause.
+    pause : int, default 0
+        Time, in seconds, to pause between consecutive chunks.
+    **kwargs: additional arguments to pass to _get_hist_yahoo
+
+    Returns
+    -------
+    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
+    """
+    def dl_mult_symbols(symbols):
+        stocks = {}
+        for sym_group in _in_chunks(symbols, chunk):
+            for sym in sym_group:
+                try:
+                    stocks[sym] = _get_hist_yahoo(name=sym, start=start,
+                                                  end=end, **kwargs)
+                except:
+                    warnings.warn('Error with sym: ' + sym + '... skipping.')
+
+            time.sleep(pause)
+
+        return Panel(stocks).swapaxes('items', 'minor')
+
+    #If a scalar (single symbol, e.g. 'GOOG')
+    if isinstance(symbols, (str, int)):
+        sym = symbols
+        hist_data = _get_hist_yahoo(sym, start=start, end=end, **kwargs)
+    #Multiple symbols
+    elif isinstance(symbols, DataFrame):
+        try:
+            hist_data = dl_mult_symbols(Series(symbols.index))
+        except ValueError:
+            raise
+    else: #Guess a Series
+        try:
+            hist_data = dl_mult_symbols(symbols)
+        except TypeError:
+            hist_data = dl_mult_symbols(Series(symbols))
+
+    if(ret_index):
+        hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
+    if(adjust_price):
+        hist_data = _adjust_prices(hist_data)
+
+    return hist_data
+
+
 def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
                   end=dt.datetime.today()):
     """

From 9b59f92fd4427e263eaaec6c9a245af06d0451d5 Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Sat, 2 Feb 2013 14:45:29 -0800
Subject: [PATCH 2/3] TST: Expanded test coverage of yahoo finance funcs

---
 pandas/io/tests/test_yahoo.py | 90 ++++++++++++++++++++++++++++++-----
 1 file changed, 77 insertions(+), 13 deletions(-)

diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
index 89c650316468c..f705eddff5f56 100644
--- a/pandas/io/tests/test_yahoo.py
+++ b/pandas/io/tests/test_yahoo.py
@@ -1,14 +1,16 @@
-from pandas.util.py3compat import StringIO, BytesIO
-from datetime import datetime
-import csv
-import os
-import sys
-import re
 import unittest
-import pandas.io.data as pd
 import nose
-from pandas.util.testing import network
+from datetime import datetime
+
+from pandas.util.py3compat import StringIO, BytesIO
+
+import pandas as pd
+import pandas.io.data as web
+from pandas.util.testing import (network, assert_frame_equal,
+                                 assert_series_equal,
+                                 assert_almost_equal)
 from numpy.testing.decorators import slow
+
 import urllib2
 
 
@@ -21,16 +23,16 @@ def test_yahoo(self):
         # an excecption when DataReader can't get a 200 response from
         # yahoo
         start = datetime(2010, 1, 1)
-        end = datetime(2012, 1, 24)
+        end = datetime(2013, 01, 27)
 
         try:
             self.assertEquals(
-                pd.DataReader("F", 'yahoo', start, end)['Close'][-1],
-                12.82)
+                web.DataReader("F", 'yahoo', start, end)['Close'][-1],
+                13.68)
 
             self.assertRaises(
                 Exception,
-                lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo',
+                lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo',
                                       start, end))
         except urllib2.URLError:
             try:
@@ -40,7 +42,69 @@ def test_yahoo(self):
             else:
                 raise
 
+    @slow
+    @network
+    def test_get_quote(self):
+        df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG']))
+        assert_series_equal(df.ix[0], df.ix[2])
+
+    @slow
+    @network
+    def test_get_components(self):
+
+        df = web.get_components_yahoo() #Dow Jones (default)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 30
+
+        df = web.get_components_yahoo('^GDAXI') #DAX
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 30
+        assert df[df.name.str.contains('adidas', case=False)].index == 'ADS.DE'
+
+        df = web.get_components_yahoo('^NDX') #NASDAQ-100
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 100
+        #Usual culprits, should be around for a while
+        assert 'AAPL' in df.index
+        assert 'GOOG' in df.index
+        assert 'AMZN' in df.index
+
+    @slow
+    @network
+    def test_get_data(self):
+        #single symbol
+        #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
+        df = web.get_data_yahoo('GOOG')
+        assert df.Volume.ix['OCT-08-2010'] == 2859200
+
+        sl = ['AAPL', 'AMZN', 'GOOG']
+        pan = web.get_data_yahoo(sl, '2012')
+        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
+        assert ts[0].dayofyear == 96
+
+        dfi = web.get_components_yahoo('^DJI')
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13')
+        expected = [19.02, 28.23, 25.39]
+        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
+        assert result == expected
+
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13',
+                                 adjust_price=True)
+        expected = [18.38, 27.45, 24.54]
+        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
+        assert result == expected
+
+        pan = web.get_data_yahoo(dfi, '2011', ret_index=True)
+        d = [[ 1.31810193,  1.08170606,  1.05281026],
+             [ 1.31810193,  1.09352518,  1.05658242],
+             [ 1.30228471,  1.09815005,  1.05054696],
+             [ 1.30521383,  1.08119219,  1.03545832]]
+
+        expected = pd.DataFrame(d)
+        result = pan.Ret_Index[['GE', 'INTC', 'MSFT']].ix[-5:-1]
+        assert_almost_equal(result.values, expected.values)
+
+
 if __name__ == '__main__':
-    import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

From 770810461df6ee6fd02bc49618e228ca5f483e72 Mon Sep 17 00:00:00 2001
From: "Nicholaus E. Halecky" <nehalecky@gmail.com>
Date: Tue, 5 Feb 2013 00:33:30 -0800
Subject: [PATCH 3/3] BUG: Fix backwards compatibility in get_data_yahoo

---
 pandas/io/data.py             | 60 ++++++++++++++++++++---------------
 pandas/io/tests/test_yahoo.py | 20 ++++++------
 2 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/pandas/io/data.py b/pandas/io/data.py
index 964a58739b7e6..2c08838596196 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -132,7 +132,7 @@ def get_quote_yahoo(symbols):
     return DataFrame(data, index=idx)
 
 
-def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3,
+def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
                     pause=0):
     """
     Get historical data for the given name from yahoo.
@@ -140,7 +140,7 @@ def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3,
 
     Returns a DataFrame.
     """
-    if(name is None):
+    if(sym is None):
         warnings.warn("Need to provide a name.")
         return None
 
@@ -148,7 +148,7 @@ def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3,
 
     yahoo_URL = 'http://ichart.yahoo.com/table.csv?'
 
-    url = yahoo_URL + 's=%s' % name + \
+    url = yahoo_URL + 's=%s' % sym + \
         '&a=%s' % (start.month - 1) + \
         '&b=%s' % start.day + \
         '&c=%s' % start.year + \
@@ -203,17 +203,18 @@ def _calc_return_index(price_df):
     return ret_index
 
 
-def get_components_yahoo(idx_sym='^DJI'):
+def get_components_yahoo(idx_sym):
     """
-    Returns DataFrame containing list of component information for index
-    represented in idx_sym from yahoo. Includes component symbol
+    Returns DataFrame containing list of component information for
+    index represented in idx_sym from yahoo. Includes component symbol
     (ticker), exchange, and name.
 
     Parameters
     ----------
     idx_sym : str
-        Index symbol, default '^DJI' (Dow Jones Industrial Average)
+        Stock index symbol
         Examples:
+        '^DJI' (Dow Jones Industrial Average)
         '^NYA' (NYSE Composite)
         '^IXIC' (NASDAQ Composite)
 
@@ -256,8 +257,8 @@ def get_components_yahoo(idx_sym='^DJI'):
     return idx_df
 
 
-def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False,
-                   ret_index=False, chunk=25, pause=0, **kwargs):
+def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
+                   adjust_price=False, ret_index=False, chunksize=25, **kwargs):
     """
     Returns DataFrame/Panel of historical stock prices from symbols, over date
     range, start to end. To avoid being penalized by Yahoo! Finance servers,
@@ -265,35 +266,39 @@ def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False,
 
     Parameters
     ----------
-    symbols : string, list-like object (list, tupel, Series), DataFrame
+    symbols : string, list-like object (list, tupel, Series), or DataFrame
         Single stock symbol (ticker), list-like object of symbols or
-        DataFrame with index containing of stock symbols
+        DataFrame with index containing stock symbols.
     start : string, (defaults to '1/1/2010')
         Starting date, timestamp. Parses many different kind of date
         representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
-    end :  string, (defaults to today)
+    end : string, (defaults to today)
         Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry query request.
+    pause : int, default 0
+        Time, in seconds, to pause between consecutive queries of chunks. If
+        single value given for symbol, represents the pause between retries.
     adjust_price : bool, default False
-        Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via
-        'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'.
-    ret_index: bool, default False
-        Include a simple return index 'Ret_Index' in hist_data.
-    chunk : int, default 25
+        If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close')
+        based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
+        'Adj Close'.
+    ret_index : bool, default False
+        If True, includes a simple return index 'Ret_Index' in hist_data.
+    chunksize : int, default 25
         Number of symbols to download consecutively before intiating pause.
-    pause : int, default 0
-        Time, in seconds, to pause between consecutive chunks.
-    **kwargs: additional arguments to pass to _get_hist_yahoo
 
     Returns
     -------
     hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
     """
+
     def dl_mult_symbols(symbols):
         stocks = {}
-        for sym_group in _in_chunks(symbols, chunk):
+        for sym_group in _in_chunks(symbols, chunksize):
             for sym in sym_group:
                 try:
-                    stocks[sym] = _get_hist_yahoo(name=sym, start=start,
+                    stocks[sym] = _get_hist_yahoo(sym, start=start,
                                                   end=end, **kwargs)
                 except:
                     warnings.warn('Error with sym: ' + sym + '... skipping.')
@@ -302,11 +307,16 @@ def dl_mult_symbols(symbols):
 
         return Panel(stocks).swapaxes('items', 'minor')
 
-    #If a scalar (single symbol, e.g. 'GOOG')
+    if 'name' in kwargs:
+        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
+                      FutureWarning)
+        symbols = kwargs['name']
+
+    #If a single symbol, (e.g., 'GOOG')
     if isinstance(symbols, (str, int)):
         sym = symbols
-        hist_data = _get_hist_yahoo(sym, start=start, end=end, **kwargs)
-    #Multiple symbols
+        hist_data = _get_hist_yahoo(sym, start=start, end=end)
+    #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
     elif isinstance(symbols, DataFrame):
         try:
             hist_data = dl_mult_symbols(Series(symbols.index))
diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
index f705eddff5f56..1f25e3ccd165f 100644
--- a/pandas/io/tests/test_yahoo.py
+++ b/pandas/io/tests/test_yahoo.py
@@ -42,17 +42,19 @@ def test_yahoo(self):
             else:
                 raise
 
+
     @slow
     @network
     def test_get_quote(self):
         df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG']))
         assert_series_equal(df.ix[0], df.ix[2])
 
+
     @slow
     @network
     def test_get_components(self):
 
-        df = web.get_components_yahoo() #Dow Jones (default)
+        df = web.get_components_yahoo('^DJI') #Dow Jones
         assert isinstance(df, pd.DataFrame)
         assert len(df) == 30
 
@@ -63,7 +65,7 @@ def test_get_components(self):
 
         df = web.get_components_yahoo('^NDX') #NASDAQ-100
         assert isinstance(df, pd.DataFrame)
-        assert len(df) == 100
+        #assert len(df) == 100
         #Usual culprits, should be around for a while
         assert 'AAPL' in df.index
         assert 'GOOG' in df.index
@@ -83,25 +85,25 @@ def test_get_data(self):
         assert ts[0].dayofyear == 96
 
         dfi = web.get_components_yahoo('^DJI')
-        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13')
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12')
         expected = [19.02, 28.23, 25.39]
         result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
         assert result == expected
 
-        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-13',
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12',
                                  adjust_price=True)
         expected = [18.38, 27.45, 24.54]
         result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
         assert result == expected
 
         pan = web.get_data_yahoo(dfi, '2011', ret_index=True)
-        d = [[ 1.31810193,  1.08170606,  1.05281026],
-             [ 1.31810193,  1.09352518,  1.05658242],
-             [ 1.30228471,  1.09815005,  1.05054696],
-             [ 1.30521383,  1.08119219,  1.03545832]]
+        d = [[ 1.01757469,  1.01130524,  1.02414183],
+             [ 1.00292912,  1.00770812,  1.01735194],
+             [ 1.00820152,  1.00462487,  1.01320257],
+             [ 1.08025776,  0.99845838,  1.00113165]]
 
         expected = pd.DataFrame(d)
-        result = pan.Ret_Index[['GE', 'INTC', 'MSFT']].ix[-5:-1]
+        result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']]
         assert_almost_equal(result.values, expected.values)