Gap up short trade¶

In [ ]:
from IPython.display import Image
from pandas.tseries.offsets import BDay
import os
# Import config file to get database info
import configparser

# Use shutil to copy charts to pattern directory
import shutil
import datetime
import time
import math
import statistics
import numpy as np
import pandas as pd
import pytz
import sqlalchemy

# Show pretty progress bars
from tqdm import tqdm

#Needed for 3D plotting
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Choose a seaborn palette
sns.set_palette("pastel")

# Various customizations for matplotlib and seaborn to get pretty plots
small_font_size = 12
medium_font_size = 14
large_font_size = 16

plt.rc('font', size=medium_font_size)          # controls default text sizes
plt.rc('axes', titlesize=medium_font_size)     # fontsize of the axes title
plt.rc('axes', labelsize=medium_font_size)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=medium_font_size)    # fontsize of the tick labels
plt.rc('ytick', labelsize=medium_font_size)    # fontsize of the tick labels
plt.rc('legend', fontsize=medium_font_size)    # legend fontsize
plt.rc('figure', titlesize=large_font_size)   # fontsize of the figure title

# Set matplotlib to always use the Inter font
plt.rcParams['font.family'] = "Inter"
In [ ]:
# Load configuration
config = configparser.ConfigParser()
config.read("../config/config.cnf")

host = config['postgresql']['host']
user = config['postgresql']['user']
passwd = config['postgresql']['passwd']
db = config['postgresql']['db']

Helper functions and variables¶

In [ ]:
# Percentile list used below for analysis
percentiles = [.10, .20, .30, .40, .50, .60, .70, .80, .90]

# Optionally set a pattern here
pattern_name = ""

Database connection(s)¶

In [ ]:
# Connect to database
engine = sqlalchemy.create_engine(f"postgresql+psycopg2://{user}:{passwd}@{host}:5432/{db}", 
                                       connect_args={'sslmode': 'require'}, 
                                       echo=False)

Read data from source¶

In [ ]:
# Read db table containing patterns
source_df = pd.read_sql('SELECT * FROM pt_output;', con=engine)

Show some basic info about the data set:

In [ ]:
print(f"Rows in data set: {len(source_df.axes[0])}")

print("Columns:")
for item in source_df.columns:
    print(item)
Rows in data set: 2289
Columns:
date
symbol
pattern
secondary_pattern
chart_nr
mkt_cap
shs_float
comment
news_date
news_headline
news_link
dilution_filing
dilution_filing_date
dilution_filing_file_number
dilution_filing_link
split_history
sector
industry
country
chart_name_eod
chart_name_1m
risk_level
support_level
gap-up_short_second_push
news_classifier
chart_name_manual
traded_profit
traded_loss
short_at_open
o
h
l
c
v
o-1
h-1
l-1
c-1
v-1
close_sma_10
premkt_04:00:00_vwap
premkt_04:00:00_volume_ma
premkt_04:00:00_09:29:00_volume
premkt_04:00:00_09:29:00_volume_max
premkt_04:00:00_09:29:00_volume_min
premkt_04:00:00_09:29:00_volume_vs_shs_float
premkt_04:00:00_09:29:00_volume_vs_total_volume
premkt_04:00:00_09:29:00_volume_mean_over_first_15_candles
premkt_04:00:00_09:29:00_volume_mean_over_last_15_candles
premkt_04:00:00_09:29:00_dollar_volume
premkt_04:00:00_09:29:00_dollar_volume_vs_mkt_cap
premkt_04:00:00_09:29:00_high
premkt_04:00:00_09:29:00_high_t
premkt_04:00:00_09:29:00_low
premkt_04:00:00_09:29:00_low_t
morning_09:30:00_vwap
morning_09:30:00_volume_ma
morning_09:30:00_11:59:00_volume
morning_09:30:00_11:59:00_volume_max
morning_09:30:00_11:59:00_volume_min
morning_09:30:00_11:59:00_volume_vs_shs_float
morning_09:30:00_11:59:00_volume_vs_total_volume
morning_09:30:00_11:59:00_volume_mean_over_first_15_candles
morning_09:30:00_11:59:00_volume_mean_over_last_15_candles
morning_09:30:00_11:59:00_dollar_volume
morning_09:30:00_11:59:00_dollar_volume_vs_mkt_cap
morning_09:30:00_11:59:00_high
morning_09:30:00_11:59:00_high_t
morning_09:30:00_11:59:00_low
morning_09:30:00_11:59:00_low_t
afternoon_12:00:00_vwap
afternoon_12:00:00_volume_ma
afternoon_12:00:00_15:59:00_volume
afternoon_12:00:00_15:59:00_volume_max
afternoon_12:00:00_15:59:00_volume_min
afternoon_12:00:00_15:59:00_volume_vs_shs_float
afternoon_12:00:00_15:59:00_volume_vs_total_volume
afternoon_12:00:00_15:59:00_volume_mean_over_first_15_candles
afternoon_12:00:00_15:59:00_volume_mean_over_last_15_candles
afternoon_12:00:00_15:59:00_dollar_volume
afternoon_12:00:00_15:59:00_dollar_volume_vs_mkt_cap
afternoon_12:00:00_15:59:00_high
afternoon_12:00:00_15:59:00_high_t
afternoon_12:00:00_15:59:00_low
afternoon_12:00:00_15:59:00_low_t
afthrs_16:00:00_vwap
afthrs_16:00:00_volume_ma
afthrs_16:00:00_19:59:00_volume
afthrs_16:00:00_19:59:00_volume_max
afthrs_16:00:00_19:59:00_volume_min
afthrs_16:00:00_19:59:00_volume_vs_shs_float
afthrs_16:00:00_19:59:00_volume_vs_total_volume
afthrs_16:00:00_19:59:00_volume_mean_over_first_15_candles
afthrs_16:00:00_19:59:00_volume_mean_over_last_15_candles
afthrs_16:00:00_19:59:00_dollar_volume
afthrs_16:00:00_19:59:00_dollar_volume_vs_mkt_cap
afthrs_16:00:00_19:59:00_high
afthrs_16:00:00_19:59:00_high_t
afthrs_16:00:00_19:59:00_low
afthrs_16:00:00_19:59:00_low_t
o_pct
h_pct
l_pct
c_pct
premkt_04:00:00_09:29:00_high_vs_c-1_pct
premkt_04:00:00_09:29:00_high_vs_o_pct
morning_09:30:00_11:59:00_high_vs_c-1_pct
morning_09:30:00_11:59:00_high_vs_o_pct
afternoon_12:00:00_15:59:00_high_vs_c-1_pct
afternoon_12:00:00_15:59:00_high_vs_o_pct
afthrs_16:00:00_19:59:00_high_vs_c-1_pct
afthrs_16:00:00_19:59:00_high_vs_o_pct
premkt_04:00:00_09:29:00_low_vs_c-1_pct
premkt_04:00:00_09:29:00_low_vs_o_pct
morning_09:30:00_11:59:00_low_vs_c-1_pct
morning_09:30:00_11:59:00_low_vs_o_pct
afternoon_12:00:00_15:59:00_low_vs_c-1_pct
afternoon_12:00:00_15:59:00_low_vs_o_pct
afthrs_16:00:00_19:59:00_low_vs_c-1_pct
afthrs_16:00:00_19:59:00_low_vs_o_pct
close_sma_10_vs_c-1_pct
close_sma_10_vs_o_pct
close_sma_10_vs_c-1_pct_vs_c-1_pct
close_sma_10_vs_c-1_pct_vs_o_pct
close_sma_10_vs_o_pct_vs_c-1_pct
close_sma_10_vs_o_pct_vs_o_pct

Data cleanup¶

In [ ]:
# Make sure that pandas knows that there's a date
# But don't use it as index because there the same date 
# can appear multiple times
source_df["date"] = pd.to_datetime(source_df["date"])
# Sort by date
source_df.sort_values(by=['date'])

# Let's remove all rows which do not contain a pattern
# Not in this case, we let the data "guide us"
#df.dropna(subset=['pattern'])
#print(f"Rows in data set after removing rows with empty patterns: {df.count(axis='rows')[0]}")

# Some basic hygiene
source_df["sector"] = source_df["sector"].str.replace("Health Care", "Healthcare")

Select data from source_df¶

In [ ]:
# The high should be before the low during 9:30 am to 12:00 am, only select matching rows
select_df = source_df[(source_df['morning_09:30:00_11:59:00_high_t'] < source_df['morning_09:30:00_11:59:00_low_t'])]

# Make copy to avoid "A value is trying to be set on a copy of a slice from a DataFrame."
df = select_df.copy()

# We have to further narrow down our search because some 1-minute data has been adjusted for splits while eod data has not
# This results in some stocks seemingly shooting up, e.g., 8200%
# It makes sense to, e.g., leave out stocks gapping up more than 200% since we rarely see this
# df = df[(df["pattern"] == 'gap-up_short') &  (df["date"] > "2020-01-01") &  (df["o_pct"] < 2) & (df['shs_float'] < 50) ]

print(f"Number of rows in selection: {len(df.axes[0])}")
Number of rows in selection: 1339
In [ ]:
# Calculate percent change from high to low in a new df column
df['morning_09:30:00_11:59:00_high_vs_low_pct'] = df['morning_09:30:00_11:59:00_low'] / df['morning_09:30:00_11:59:00_high'] - 1

How to find the right gap-up level¶

We care about the difference between the high and the low between 9:30 am and 12 pm, because that's where we would like to short stocks. A basic premise of the strategy is that a company will publish news to boost their stock, making it gap up. But which gap-up level gives us the best chance of a successful trade? Let's plot the correlation between 'morning_09:30:00_11:59:00_high_vs_low_pct' and 'o_pct'. Before that, let's look at them individually, starting with 'o_pct' (how much the stock gaps up from the previous close).

In [ ]:
parameter = "o_pct"

s = df[parameter].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))

print(f"Median: {statistics.median(s)}")

s = s.sort_values(ascending=True)

# Create plot
hp = sns.histplot(data = df[parameter][(df[parameter] < s[11])])

hp = sns.histplot(data = df[parameter][(df[parameter] < s[10])])

hp = sns.histplot(data = df[parameter][(df[parameter] < s[9])], kde=True)
count    1339.000000
mean        0.510562
std         1.271964
min        -0.326000
10%        -0.044000
20%         0.128000
30%         0.201000
40%         0.248200
50%         0.315000
60%         0.416800
70%         0.540600
80%         0.732000
90%         1.075600
max        39.900000
Name: o_pct, dtype: float64
14
Median: 0.4636811799850635

For now, we are only interested in positive values:

In [ ]:
parameter = "o_pct"

lower_bound = 0
upper_bound = 2

s = df[parameter][(df[parameter] > lower_bound) & (df[parameter] < upper_bound)].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))

print(f"Median: {statistics.median(s)}")

s = s.sort_values(ascending=True)

# Create plot
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) &(df[parameter] < s[11])])
# Zoom in
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) &(df[parameter] < s[10])])
# Zoom in some more
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) &(df[parameter] < s[9])], kde=True)
count    1120.000000
mean        0.498438
std         0.393432
min         0.002000
10%         0.147900
20%         0.202800
30%         0.242000
40%         0.294600
50%         0.372000
60%         0.457400
70%         0.595000
80%         0.759200
90%         1.038000
max         1.970000
Name: o_pct, dtype: float64
14
Median: 0.4254161403909903

Now for 'morning_09:30:00_11:59:00_high_vs_low_pct':

In [ ]:
parameter = 'morning_09:30:00_11:59:00_high_vs_low_pct'

s = df[parameter].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))

print(f"Median: {statistics.median(s)}")

s = s.sort_values(ascending=True)

# Create plot
hp = sns.histplot(data = df[parameter][(df[parameter] < s[12])])

hp = sns.histplot(data = df[parameter][(df[parameter] < s[11])])

hp = sns.histplot(data = df[parameter][(df[parameter] < s[10])], kde=True)
count    1339.000000
mean       -0.239592
std         0.103178
min        -0.803845
10%        -0.376298
20%        -0.327813
30%        -0.283680
40%        -0.250000
50%        -0.219697
60%        -0.198745
70%        -0.175602
80%        -0.151804
90%        -0.125000
max        -0.027495
Name: morning_09:30:00_11:59:00_high_vs_low_pct, dtype: float64
14
Median: -0.2092209415823183

And now the pair plot. We know from above that 90% of all values are below 1 for 'o_pct' so let's limit the data in the chart to, e.g., less than 120%.

In [ ]:
pp = sns.pairplot(data=df[(df['o_pct'] > 0) & (df['o_pct'] < 1.2)],
                  y_vars=['o_pct'],
                  x_vars=['morning_09:30:00_11:59:00_high_vs_low_pct'],
                  #x_vars=[(df["shs_float"] < 30)],
                 height=5)
pp.fig.set_size_inches(10,10)

# Rotate y-lables for better readability
for ax in pp.axes.flatten():
    # rotate y axis labels
    ax.set_ylabel(ax.get_ylabel(), rotation = 0)
    # set y labels alignment
    ax.yaxis.get_label().set_horizontalalignment('right')

Let's also do a regression plot:

In [ ]:
fig, ax = plt.subplots(figsize=(10,10))
rp = sns.regplot(y="o_pct", x="morning_09:30:00_11:59:00_high_vs_low_pct", data=df[(df['o_pct'] > 0) & (df['o_pct'] < 1.2)], ax=ax)

We see a nice correlation between the gap up and how much the stock falls from its high to the low which makes sense. Based on the analysis above we can now refine our selection from 'source_df'. Let's add 'o_pct' as criterion, between 0 and 2.

In [ ]:
select_df = source_df[(source_df['morning_09:30:00_11:59:00_high_t'] < source_df['morning_09:30:00_11:59:00_low_t']) & (df['o_pct'] > 0) & (df['o_pct'] < 2)] 

df = select_df.copy()

print(f"Number of rows in selection: {len(df.axes[0])}")
Number of rows in selection: 1120

It's a new df, so we have to calculate 'morning_09:30:00_11:59:00_high_vs_low_pct' again:

In [ ]:
# Calculate percent change from high to low in a new df column
df['morning_09:30:00_11:59:00_high_vs_low_pct'] = df['morning_09:30:00_11:59:00_low'] / df['morning_09:30:00_11:59:00_high'] - 1

Display the data to check whether the selection is correct.

In [ ]:
df[["date", "symbol", "morning_09:30:00_11:59:00_high_t", "morning_09:30:00_11:59:00_low_t", "morning_09:30:00_11:59:00_high_vs_low_pct", "morning_09:30:00_11:59:00_low", "morning_09:30:00_11:59:00_high"]]
Out[ ]:
date symbol morning_09:30:00_11:59:00_high_t morning_09:30:00_11:59:00_low_t morning_09:30:00_11:59:00_high_vs_low_pct morning_09:30:00_11:59:00_low morning_09:30:00_11:59:00_high
5 2019-01-09 VTVT 09:46:00 10:46:00 -0.151515 2.80 3.3000
14 2019-01-17 LMFA 09:53:00 11:35:00 -0.322581 2.10 3.1000
16 2019-01-24 BLNK 09:37:00 09:56:00 -0.186441 2.40 2.9500
20 2019-01-29 AVCO 09:35:00 11:05:00 -0.246914 6.10 8.1000
21 2019-01-16 TBLT 09:34:00 11:19:00 -0.400000 3.33 5.5500
... ... ... ... ... ... ... ...
2273 2023-01-25 INPX 09:47:00 11:21:00 -0.251163 1.61 2.1500
2275 2023-01-26 BZFD 09:44:00 11:09:00 -0.209099 1.21 1.5299
2279 2023-01-26 BZFD 09:44:00 11:09:00 -0.209099 1.21 1.5299
2287 2023-02-02 TENX 09:31:00 11:48:00 -0.381395 2.66 4.3000
2288 2023-02-02 ZVSA 09:30:00 10:43:00 -0.238255 2.27 2.9800

1120 rows × 7 columns

In [ ]:
# We have to further narrow down our search because some 1-minute data has been adjusted for splits while eod data has not
# This results in some stocks seemingly shooting up, e.g., 8200%
# It makes sense to, e.g., leave out stocks gapping up more than 200% since we rarely see this
# df = df[(df["pattern"] == 'gap-up_short') &  (df["date"] > "2020-01-01") &  (df["o_pct"] < 2) & (df['shs_float'] < 50) ]

Write the dataframe to a CSV file.

In [ ]:
df.to_csv("df.csv")

Let's check out some fundamental data.

In [ ]:
parameter = "sector"

s = df[parameter].describe(percentiles = percentiles)
print(s)

plt.figure(figsize=(30,5))
hp = sns.histplot(data = df[parameter].replace(np.nan, "n/a"))

# Rotate x-lables for better readability
hp.set_xticklabels(hp.get_xticklabels(), rotation=30)
count           1066
unique            18
top       Healthcare
freq             558
Name: sector, dtype: object
/var/folders/s9/wdvhc0790lbd65fxspb0_45m0000gn/T/ipykernel_86032/3865418329.py:10: UserWarning: FixedFormatter should only be used together with FixedLocator
  hp.set_xticklabels(hp.get_xticklabels(), rotation=30)
Out[ ]:
[Text(0, 0, 'Healthcare'),
 Text(1, 0, 'Financial'),
 Text(2, 0, 'Consumer Cyclical'),
 Text(3, 0, 'n/a'),
 Text(4, 0, 'Industrials'),
 Text(5, 0, 'Basic Materials'),
 Text(6, 0, 'Technology'),
 Text(7, 0, 'Communication Services'),
 Text(8, 0, 'Conglomerates'),
 Text(9, 0, 'Energy'),
 Text(10, 0, 'Services'),
 Text(11, 0, 'Information Technology'),
 Text(12, 0, 'Consumer Goods'),
 Text(13, 0, 'Industrial Goods'),
 Text(14, 0, 'Consumer Discretionary'),
 Text(15, 0, 'Utilities'),
 Text(16, 0, 'Consumer Defensive'),
 Text(17, 0, 'Real Estate'),
 Text(18, 0, '')]