Finding structure in the stock market
An example of playing with stock market data to try and find some structure in it.
Python source code: stock_market.py
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD
import datetime
from matplotlib import finance
import numpy as np
from scikits.learn import cluster
# Choose a reasonably calm time period (not too long ago, so that we get
# high-tech firms, and before the 2008 crash).
# Month and day are written without a leading zero: ``01`` is an octal
# literal on Python 2 and a syntax error on Python 3.
d1 = datetime.datetime(2003, 1, 1)
d2 = datetime.datetime(2008, 1, 1)
# Maps each ticker symbol to a human-readable company name.  The symbols are
# what gets looked up; the names are only used to label the printed clusters.
symbol_dict = {
    'TOT': 'Total',
    'XOM': 'Exxon',
    'CVX': 'Chevron',
    'COP': 'ConocoPhillips',
    'VLO': 'Valero Energy',
    'MSFT': 'Microsoft',
    'IBM': 'IBM',
    'TWX': 'Time Warner',
    'CMCSA': 'Comcast',
    'CVC': 'Cablevision',
    'YHOO': 'Yahoo',
    'DELL': 'Dell',
    'HPQ': 'Hewlett-Packard',
    'AMZN': 'Amazon',
    'TM': 'Toyota',
    'CAJ': 'Canon',
    'MTU': 'Mitsubishi',
    'SNE': 'Sony',
    'F': 'Ford',
    'HMC': 'Honda',
    'NAV': 'Navistar',
    'NOC': 'Northrop Grumman',
    'BA': 'Boeing',
    'KO': 'Coca Cola',
    'MMM': '3M',
    'MCD': 'Mc Donalds',
    'PEP': 'Pepsi',
    'KFT': 'Kraft Foods',
    'K': 'Kellogg',
    'UN': 'Unilever',
    'MAR': 'Marriott',
    'PG': 'Procter Gamble',
    'CL': 'Colgate-Palmolive',
    'NWS': 'News Corporation',
    'GE': 'General Electric',
    'WFC': 'Wells Fargo',
    'JPM': 'JPMorgan Chase',
    'AIG': 'AIG',
    'AXP': 'American express',
    'BAC': 'Bank of America',
    'GS': 'Goldman Sachs',
    'AAPL': 'Apple',
    'SAP': 'SAP',
    'CSCO': 'Cisco',
    'TXN': 'Texas instruments',
    'XRX': 'Xerox',
    'LMT': 'Lockheed Martin',
    'WMT': 'Wal-Mart',
    'WAG': 'Walgreen',
    'HD': 'Home Depot',
    'GSK': 'GlaxoSmithKline',
    'PFE': 'Pfizer',
    'SNY': 'Sanofi-Aventis',
    'NVS': 'Novartis',
    'KMB': 'Kimberly-Clark',
    'R': 'Ryder',
    'GD': 'General Dynamics',
    'RTN': 'Raytheon',
    'CVS': 'CVS',
    'CAT': 'Caterpillar',
    'DD': 'DuPont de Nemours',
}
# Split the mapping into two parallel arrays: ticker symbols and company
# names.  ``list()`` is required because on Python 3 ``dict.items()`` is a
# view, which NumPy would wrap as a single opaque object instead of a 2-D
# array of (symbol, name) pairs.
symbols, names = np.array(list(symbol_dict.items())).T

# Fetch the historical quotes of every symbol between d1 and d2.
quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
          for symbol in symbols]

# One row per symbol, one column per quote.
# NOTE(review): this assumes every symbol yields the same number of quotes;
# ragged rows would not stack into a 2-D array -- confirm against the data.
# The ``*_prices`` names avoid shadowing the ``open`` builtin, and the plain
# ``float`` dtype replaces the ``np.float`` alias removed from recent NumPy.
open_prices = np.array([q.open for q in quotes]).astype(float)
close_prices = np.array([q.close for q in quotes]).astype(float)

# The signal being clustered is the open-to-close change of each quote.
variation = close_prices - open_prices

# Correlate the symbols' variations with each other and cluster on that
# matrix; only the per-symbol cluster labels are kept from the result.
correlations = np.corrcoef(variation)
_, labels = cluster.affinity_propagation(correlations)

# Report clusters with 1-based numbering.  A single parenthesised argument
# prints identically under the Python 2 statement and the Python 3 function.
for i in range(labels.max() + 1):
    print('Cluster %i: %s' % (i + 1, ', '.join(names[labels == i])))