Machine learning against S&P 500 company prices - Python Programming for Finance p.12




Hello and welcome to part 12 of the Python for Finance tutorial series. In the previous tutorial, we covered how to take our data and create featuresets and labels out of it, which we can then feed through a machine learning algorithm with the hope that it will learn to map relationships of existing price changes to future price changes for a company.

Before we begin, our starting code up to this point:

import bs4 as bs
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
from collections import Counter

style.use('ggplot')


def save_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        tickers.append(ticker)
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers


# save_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    start = dt.datetime(2010, 1, 1)
    end = dt.datetime.now()
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            df = web.DataReader(ticker, 'morningstar', start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df = df.drop("Symbol", axis=1)
            df.to_csv('stock_dfs/{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))


def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)

        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')

        if count % 10 == 0:
            print(count)
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')


def visualize_data():
    df = pd.read_csv('sp500_joined_closes.csv')
    df_corr = df.corr()
    print(df_corr.head())
    df_corr.to_csv('sp500corr.csv')
    data1 = df_corr.values
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)

    heatmap1 = ax1.pcolor(data1, cmap=plt.cm.RdYlGn)
    fig1.colorbar(heatmap1)

    ax1.set_xticks(np.arange(data1.shape[1]) + 0.5, minor=False)
    ax1.set_yticks(np.arange(data1.shape[0]) + 0.5, minor=False)
    ax1.invert_yaxis()
    ax1.xaxis.tick_top()
    column_labels = df_corr.columns
    row_labels = df_corr.index
    ax1.set_xticklabels(column_labels)
    ax1.set_yticklabels(row_labels)
    plt.xticks(rotation=90)
    heatmap1.set_clim(-1, 1)
    plt.tight_layout()
    plt.show()


def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)
    for i in range(1, hm_days+1):
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
    df.fillna(0, inplace=True)
    return tickers, df


def buy_sell_hold(*args):
    cols = [c for c in args]
    requirement = 0.02
    for col in cols:
        if col > requirement:
            return 1
        if col < -requirement:
            return -1
    return 0


def extract_featuresets(ticker):
    tickers, df = process_data_for_labels(ticker)

    df['{}_target'.format(ticker)] = list(map( buy_sell_hold,
                                               df['{}_1d'.format(ticker)],
                                               df['{}_2d'.format(ticker)],
                                               df['{}_3d'.format(ticker)],
                                               df['{}_4d'.format(ticker)],
                                               df['{}_5d'.format(ticker)],
                                               df['{}_6d'.format(ticker)],
                                               df['{}_7d'.format(ticker)]))

    vals = df['{}_target'.format(ticker)].values.tolist()
    str_vals = [str(i) for i in vals]
    print('Data spread:', Counter(str_vals))

    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df['{}_target'.format(ticker)].values
    return X, y, df

We're going to add the following imports:

from sklearn import svm, neighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

Sklearn is a machine learning framework. If you don't have it, install it with pip install scikit-learn. The svm import is for a Support Vector Machine, train_test_split (from model_selection) will let us easily create shuffled training and testing samples, and neighbors is for K Nearest Neighbors. Then we're bringing in the VotingClassifier and RandomForestClassifier. The voting classifier is just what it sounds like: a classifier that lets us combine many classifiers, and allows each of them to get a "vote" on what they think the class of a featureset is. The random forest classifier is just another classifier. We're going to use three classifiers in our voting classifier.
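To make the "vote" idea concrete, here is a tiny, self-contained sketch (not part of the tutorial code) of the default "hard" voting that VotingClassifier performs: each estimator predicts a class for a row, and the majority wins.

from collections import Counter

votes = [1, 1, -1]   # say the LinearSVC and KNN predict "buy", the random forest predicts "sell"
majority_class = Counter(votes).most_common(1)[0][0]
print(majority_class)   # 1 -> the ensemble's prediction for this row is "buy"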

We're ready to do some machine learning now, so let's start our function:

def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)

We've got our featuresets and labels, now we want to shuffle them up, train, and then test:

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.25)

What this does for us is shuffle our data (so it's not in any specific order anymore), and then create training and testing samples for us. We don't want to "test" this algorithm on the same data we trained against; if we did that, chances are we'd do a lot better than we would in reality. We want to test the algorithm on data it has never seen before, to see if we've actually got a model that works.
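If you want to see why the held-out test set matters, here is a small, self-contained sketch with made-up data (pure noise features and random labels, nothing to do with our S&P 500 featuresets): a flexible model can look nearly perfect on rows it memorized while scoring at chance level on rows it has never seen.

import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_demo = rng.randn(500, 10)           # 500 rows of random noise features
y_demo = rng.randint(-1, 2, 500)      # random -1 / 0 / 1 labels, so there is no real signal to learn

X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25)

demo_clf = neighbors.KNeighborsClassifier(n_neighbors=1)
demo_clf.fit(X_tr, y_tr)

print('train accuracy:', demo_clf.score(X_tr, y_tr))   # ~1.0, the model memorized its own points
print('test accuracy:', demo_clf.score(X_te, y_te))    # ~0.33, chance level for three classes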

Now we can choose from any of the classifiers we want, for now, let's do one for K Nearest Neighbors:

    clf = neighbors.KNeighborsClassifier()

Now we can fit (train) the classifier on our data:

    clf.fit(X_train, y_train)

This line takes our X data and fits it to our y data, for each of the X and y pairs we have. Once that's done, we can test it:

    confidence = clf.score(X_test, y_test)

This will take the featuresets in X_test, make predictions, and see whether they match our labels, y_test. It will return the accuracy as a decimal, where 1.0 is 100% and 0.1 is 10% accurate. Now we can output some further useful information:

    print('accuracy:',confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:',Counter(predictions))
    print()
    print()

This will tell us what the accuracy was, then we can get the predictions for the X_test data, and output the distribution (using Counter), so we can see if our model is only ever classifying one class, which is something that can easily happen.
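Incidentally, the accuracy that score returns is just the fraction of predictions that match the true labels, so you can verify it directly from the predictions we just made (np is the numpy import from the top of the script):

    manual_accuracy = np.mean(predictions == y_test)
    print(manual_accuracy)   # same value clf.score(X_test, y_test) gave us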

If this model is indeed successful, we can save it with pickle, and load it at any time to feed it some featuresets and get a prediction out of it, with clf.predict, which will predict either a single value from a single featureset, or a list of values from a list of featuresets.
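Saving and reloading the trained classifier is the same pickle pattern we already use for the ticker list. A minimal sketch, inside do_ml after training (the filename here is made up for illustration):

    # save the trained classifier to disk
    with open('{}_clf.pickle'.format(ticker), 'wb') as f:
        pickle.dump(clf, f)

    # ...later, load it back and make predictions
    with open('{}_clf.pickle'.format(ticker), 'rb') as f:
        clf = pickle.load(f)
    print(clf.predict(X_test[:5]))   # predictions for the first five featuresets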

Alright, we're ready for the moment of truth! What is our goal? Something that picks randomly should be about 33% accurate, since we have three classes in theory, but in practice our classes aren't likely to be perfectly balanced, so the real bar is a bit higher. Let's see some examples, and just run:

do_ml('XOM')
do_ml('AAPL')
do_ml('ABT') 

Output:

Data spread: Counter({'1': 1713, '-1': 1456, '0': 1108})
accuracy: 0.375700934579
predicted class counts: Counter({0: 404, -1: 393, 1: 273})


Data spread: Counter({'1': 2098, '-1': 1830, '0': 349})
accuracy: 0.4
predicted class counts: Counter({-1: 644, 1: 339, 0: 87})


Data spread: Counter({'1': 1690, '-1': 1483, '0': 1104})
accuracy: 0.33738317757
predicted class counts: Counter({-1: 383, 0: 372, 1: 315})

So all of these are better than 33%, but the training data wasn't perfectly balanced either. For example, we can look at the first one:

Data spread: Counter({'1': 1713, '-1': 1456, '0': 1108})
accuracy: 0.375700934579
predicted class counts: Counter({0: 404, -1: 393, 1: 273})

In this case, what if the model ONLY predicted "buy"? That would have been 1,713 correct out of 4,277, or about 40% accuracy, which is actually a better score than we got. What about the other two? The second one, AAPL, is about 49% accurate if it just predicts buy, at least against the full label spread. ABT is about 39.5% accurate if it just predicts buy.
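That "always predict buy" baseline is easy to compute directly from the labels. A small sketch you could drop into do_ml, using the y returned by extract_featuresets (Counter is already imported at the top):

    label_counts = Counter(y)                              # e.g. Counter({1: 1713, -1: 1456, 0: 1108})
    baseline = label_counts.most_common(1)[0][1] / len(y)  # accuracy of always guessing the most common class
    print('majority-class baseline:', baseline)            # ~0.40 for the XOM spread above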

So, while we're doing better than 33%, it's currently unclear whether this model is better than just saying "buy" on everything. In actual trading, though, all of this can change. This model is penalized, for example, if it calls something a buy, expecting a 2% rise within 7 days, and that 2% rise doesn't come until day 8; in actual trading, that call would still have been fine. The same caveat applies if this model turned out to be highly accurate: actually trading on a model is a completely different thing entirely.

Next, let's try that voting classifier. So, rather than clf = neighbors.KNeighborsClassifier(), we do:

    clf = VotingClassifier([('lsvc',svm.LinearSVC()),
                            ('knn',neighbors.KNeighborsClassifier()),
                            ('rfor',RandomForestClassifier())])

New output:

Data spread: Counter({'1': 1713, '-1': 1456, '0': 1108})
accuracy: 0.379439252336
predicted class counts: Counter({-1: 487, 1: 417, 0: 166})


Data spread: Counter({'1': 2098, '-1': 1830, '0': 349})
accuracy: 0.471028037383
predicted class counts: Counter({1: 616, -1: 452, 0: 2})


Data spread: Counter({'1': 1690, '-1': 1483, '0': 1104})
accuracy: 0.378504672897
predicted class counts: Counter({-1: 524, 1: 394, 0: 152})

Across the board, we have improvement! That's good to see. Note that we're also using the defaults on all of the algorithms. Each of these algorithms has quite a few parameters that we could spend a while tweaking to eke out a bit more performance and, at the very least, likely beat the odds of simply predicting "buy" on everything. That said, machine learning is a massive topic and it would take months to go through everything here. If you want to learn more about the algorithms so you can tweak them yourself, check out the Machine Learning tutorial series, where we cover a bunch of machine learning algorithms, how they work fundamentally, how to apply them, and then how to make them ourselves in raw Python. By the time you get through that entire series, you should be well equipped to wrangle all kinds of challenges with machine learning.
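As an example of the kind of tweaking meant here, you can pass non-default parameters to any of the estimators inside the VotingClassifier. The specific values below are illustrative guesses, not tuned recommendations:

    clf = VotingClassifier([('lsvc', svm.LinearSVC(C=0.1)),
                            ('knn', neighbors.KNeighborsClassifier(n_neighbors=15)),
                            ('rfor', RandomForestClassifier(n_estimators=200,
                                                            min_samples_leaf=5))])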

Full code up to this point:

import bs4 as bs
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
from collections import Counter
from sklearn import svm, neighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier


style.use('ggplot')


def save_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        tickers.append(ticker)
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers


# save_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    start = dt.datetime(2010, 1, 1)
    end = dt.datetime.now()
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            df = web.DataReader(ticker, 'morningstar', start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df = df.drop("Symbol", axis=1)
            df.to_csv('stock_dfs/{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))


def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)

        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')

        if count % 10 == 0:
            print(count)
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')


def visualize_data():
    df = pd.read_csv('sp500_joined_closes.csv')
    df_corr = df.corr()
    print(df_corr.head())
    df_corr.to_csv('sp500corr.csv')
    data1 = df_corr.values
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)

    heatmap1 = ax1.pcolor(data1, cmap=plt.cm.RdYlGn)
    fig1.colorbar(heatmap1)

    ax1.set_xticks(np.arange(data1.shape[1]) + 0.5, minor=False)
    ax1.set_yticks(np.arange(data1.shape[0]) + 0.5, minor=False)
    ax1.invert_yaxis()
    ax1.xaxis.tick_top()
    column_labels = df_corr.columns
    row_labels = df_corr.index
    ax1.set_xticklabels(column_labels)
    ax1.set_yticklabels(row_labels)
    plt.xticks(rotation=90)
    heatmap1.set_clim(-1, 1)
    plt.tight_layout()
    plt.show()


def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)
    for i in range(1, hm_days+1):
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
    df.fillna(0, inplace=True)
    return tickers, df


def buy_sell_hold(*args):
    cols = [c for c in args]
    requirement = 0.02
    for col in cols:
        if col > requirement:
            return 1
        if col < -requirement:
            return -1
    return 0


def extract_featuresets(ticker):
    tickers, df = process_data_for_labels(ticker)

    df['{}_target'.format(ticker)] = list(map( buy_sell_hold,
                                               df['{}_1d'.format(ticker)],
                                               df['{}_2d'.format(ticker)],
                                               df['{}_3d'.format(ticker)],
                                               df['{}_4d'.format(ticker)],
                                               df['{}_5d'.format(ticker)],
                                               df['{}_6d'.format(ticker)],
                                               df['{}_7d'.format(ticker)]))

    vals = df['{}_target'.format(ticker)].values.tolist()
    str_vals = [str(i) for i in vals]
    print('Data spread:', Counter(str_vals))

    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df['{}_target'.format(ticker)].values
    return X, y, df


def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    print()
    return confidence


# examples of running:
do_ml('XOM')
do_ml('AAPL')
do_ml('ABT')

We can also run this against all tickers:

from statistics import mean

with open("sp500tickers.pickle","rb") as f:
    tickers = pickle.load(f)

accuracies = []
for count,ticker in enumerate(tickers):

    if count%10==0:
        print(count)

    accuracy = do_ml(ticker)
    accuracies.append(accuracy)
    print("{} accuracy: {}. Average accuracy:{}".format(ticker,accuracy,mean(accuracies)))

This will take a while. I went ahead and ran it, and the result was an average accuracy of 46.279%. Not bad, but from my looking around, the results are still questionable for use with any sort of real trading strategy.

In the next tutorials, we're going to be diving into testing trading strategies.

The next tutorial: Testing trading strategies with Quantopian Introduction - Python Programming for Finance p.13

  • Intro and Getting Stock Price Data - Python Programming for Finance p.1
  • Handling Data and Graphing - Python Programming for Finance p.2
  • Basic stock data Manipulation - Python Programming for Finance p.3
  • More stock manipulations - Python Programming for Finance p.4
  • Automating getting the S&P 500 list - Python Programming for Finance p.5
  • Getting all company pricing data in the S&P 500 - Python Programming for Finance p.6
  • Combining all S&P 500 company prices into one DataFrame - Python Programming for Finance p.7
  • Creating massive S&P 500 company correlation table for Relationships - Python Programming for Finance p.8
  • Preprocessing data to prepare for Machine Learning with stock data - Python Programming for Finance p.9
  • Creating targets for machine learning labels - Python Programming for Finance p.10 and 11
  • Machine learning against S&P 500 company prices - Python Programming for Finance p.12
  • Testing trading strategies with Quantopian Introduction - Python Programming for Finance p.13
  • Placing a trade order with Quantopian - Python Programming for Finance p.14
  • Scheduling a function on Quantopian - Python Programming for Finance p.15
  • Quantopian Research Introduction - Python Programming for Finance p.16
  • Quantopian Pipeline - Python Programming for Finance p.17
  • Alphalens on Quantopian - Python Programming for Finance p.18
  • Back testing our Alpha Factor on Quantopian - Python Programming for Finance p.19
  • Analyzing Quantopian strategy back test results with Pyfolio - Python Programming for Finance p.20
  • Strategizing - Python Programming for Finance p.21
  • Finding more Alpha Factors - Python Programming for Finance p.22
  • Combining Alpha Factors - Python Programming for Finance p.23
  • Portfolio Optimization - Python Programming for Finance p.24
  • Zipline Local Installation for backtesting - Python Programming for Finance p.25
  • Zipline backtest visualization - Python Programming for Finance p.26
  • Custom Data with Zipline Local - Python Programming for Finance p.27
  • Custom Markets Trading Calendar with Zipline (Bitcoin/cryptocurrency example) - Python Programming for Finance p.28