from IPython.display import Image
from pandas.tseries.offsets import BDay
import os
# Import config file to get database info
import configparser
# Use shutil to copy charts to pattern directory
import shutil
import datetime
import time
import math
import statistics
import numpy as np
import pandas as pd
import pytz
import sqlalchemy
# Show pretty progress bars
from tqdm import tqdm
#Needed for 3D plotting
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Choose a seaborn palette
sns.set_palette("pastel")
# Various customizations for matplotlib and seaborn to get pretty plots
small_font_size = 12
medium_font_size = 14
large_font_size = 16
plt.rc('font', size=medium_font_size) # controls default text sizes
plt.rc('axes', titlesize=medium_font_size) # fontsize of the axes title
plt.rc('axes', labelsize=medium_font_size) # fontsize of the x and y labels
plt.rc('xtick', labelsize=medium_font_size) # fontsize of the tick labels
plt.rc('ytick', labelsize=medium_font_size) # fontsize of the tick labels
plt.rc('legend', fontsize=medium_font_size) # legend fontsize
plt.rc('figure', titlesize=large_font_size) # fontsize of the figure title
# Set matplotlib to always use the Inter font
plt.rcParams['font.family'] = "Inter"
# Load configuration
config = configparser.ConfigParser()
config.read("../config/config.cnf")
host = config['postgresql']['host']
user = config['postgresql']['user']
passwd = config['postgresql']['passwd']
db = config['postgresql']['db']
# Percentile list used below for analysis
percentiles = [.10, .20, .30, .40, .50, .60, .70, .80, .90]
# Optionally set a pattern here
pattern_name = ""
# Connect to database
engine = sqlalchemy.create_engine(f"postgresql+psycopg2://{user}:{passwd}@{host}:5432/{db}",
                                  connect_args={'sslmode': 'require'},
                                  echo=False)
# Read db table containing patterns
source_df = pd.read_sql('SELECT * FROM pt_output;', con=engine)
Show some basic info about the data set:
print(f"Rows in data set: {len(source_df.axes[0])}")
print("Columns:")
for item in source_df.columns:
    print(item)
Rows in data set: 2289
Columns:
date symbol pattern secondary_pattern chart_nr mkt_cap shs_float comment news_date news_headline news_link dilution_filing dilution_filing_date dilution_filing_file_number dilution_filing_link split_history sector industry country chart_name_eod chart_name_1m risk_level support_level gap-up_short_second_push news_classifier chart_name_manual traded_profit traded_loss short_at_open o h l c v o-1 h-1 l-1 c-1 v-1 close_sma_10
premkt_04:00:00_vwap premkt_04:00:00_volume_ma premkt_04:00:00_09:29:00_volume premkt_04:00:00_09:29:00_volume_max premkt_04:00:00_09:29:00_volume_min premkt_04:00:00_09:29:00_volume_vs_shs_float premkt_04:00:00_09:29:00_volume_vs_total_volume premkt_04:00:00_09:29:00_volume_mean_over_first_15_candles premkt_04:00:00_09:29:00_volume_mean_over_last_15_candles premkt_04:00:00_09:29:00_dollar_volume premkt_04:00:00_09:29:00_dollar_volume_vs_mkt_cap premkt_04:00:00_09:29:00_high premkt_04:00:00_09:29:00_high_t premkt_04:00:00_09:29:00_low premkt_04:00:00_09:29:00_low_t
morning_09:30:00_vwap morning_09:30:00_volume_ma morning_09:30:00_11:59:00_volume morning_09:30:00_11:59:00_volume_max morning_09:30:00_11:59:00_volume_min morning_09:30:00_11:59:00_volume_vs_shs_float morning_09:30:00_11:59:00_volume_vs_total_volume morning_09:30:00_11:59:00_volume_mean_over_first_15_candles morning_09:30:00_11:59:00_volume_mean_over_last_15_candles morning_09:30:00_11:59:00_dollar_volume morning_09:30:00_11:59:00_dollar_volume_vs_mkt_cap morning_09:30:00_11:59:00_high morning_09:30:00_11:59:00_high_t morning_09:30:00_11:59:00_low morning_09:30:00_11:59:00_low_t
afternoon_12:00:00_vwap afternoon_12:00:00_volume_ma afternoon_12:00:00_15:59:00_volume afternoon_12:00:00_15:59:00_volume_max afternoon_12:00:00_15:59:00_volume_min afternoon_12:00:00_15:59:00_volume_vs_shs_float afternoon_12:00:00_15:59:00_volume_vs_total_volume afternoon_12:00:00_15:59:00_volume_mean_over_first_15_candles afternoon_12:00:00_15:59:00_volume_mean_over_last_15_candles afternoon_12:00:00_15:59:00_dollar_volume afternoon_12:00:00_15:59:00_dollar_volume_vs_mkt_cap afternoon_12:00:00_15:59:00_high afternoon_12:00:00_15:59:00_high_t afternoon_12:00:00_15:59:00_low afternoon_12:00:00_15:59:00_low_t
afthrs_16:00:00_vwap afthrs_16:00:00_volume_ma afthrs_16:00:00_19:59:00_volume afthrs_16:00:00_19:59:00_volume_max afthrs_16:00:00_19:59:00_volume_min afthrs_16:00:00_19:59:00_volume_vs_shs_float afthrs_16:00:00_19:59:00_volume_vs_total_volume afthrs_16:00:00_19:59:00_volume_mean_over_first_15_candles afthrs_16:00:00_19:59:00_volume_mean_over_last_15_candles afthrs_16:00:00_19:59:00_dollar_volume afthrs_16:00:00_19:59:00_dollar_volume_vs_mkt_cap afthrs_16:00:00_19:59:00_high afthrs_16:00:00_19:59:00_high_t afthrs_16:00:00_19:59:00_low afthrs_16:00:00_19:59:00_low_t
o_pct h_pct l_pct c_pct premkt_04:00:00_09:29:00_high_vs_c-1_pct premkt_04:00:00_09:29:00_high_vs_o_pct morning_09:30:00_11:59:00_high_vs_c-1_pct morning_09:30:00_11:59:00_high_vs_o_pct afternoon_12:00:00_15:59:00_high_vs_c-1_pct afternoon_12:00:00_15:59:00_high_vs_o_pct afthrs_16:00:00_19:59:00_high_vs_c-1_pct afthrs_16:00:00_19:59:00_high_vs_o_pct premkt_04:00:00_09:29:00_low_vs_c-1_pct premkt_04:00:00_09:29:00_low_vs_o_pct morning_09:30:00_11:59:00_low_vs_c-1_pct morning_09:30:00_11:59:00_low_vs_o_pct afternoon_12:00:00_15:59:00_low_vs_c-1_pct afternoon_12:00:00_15:59:00_low_vs_o_pct afthrs_16:00:00_19:59:00_low_vs_c-1_pct afthrs_16:00:00_19:59:00_low_vs_o_pct close_sma_10_vs_c-1_pct close_sma_10_vs_o_pct close_sma_10_vs_c-1_pct_vs_c-1_pct close_sma_10_vs_c-1_pct_vs_o_pct close_sma_10_vs_o_pct_vs_c-1_pct close_sma_10_vs_o_pct_vs_o_pct
# Make sure that pandas knows there's a date column,
# but don't use it as the index because the same date
# can appear multiple times
source_df["date"] = pd.to_datetime(source_df["date"])
# Sort by date (sort_values returns a new dataframe, so re-assign)
source_df = source_df.sort_values(by=['date'])
# We could remove all rows which do not contain a pattern,
# but not in this case: we let the data "guide" us
#df = df.dropna(subset=['pattern'])
#print(f"Rows in data set after removing rows with empty patterns: {df.count(axis='rows')[0]}")
# Some basic hygiene
source_df["sector"] = source_df["sector"].str.replace("Health Care", "Healthcare")
# The high should come before the low between 9:30 am and 12:00 pm, so only select matching rows
select_df = source_df[(source_df['morning_09:30:00_11:59:00_high_t'] < source_df['morning_09:30:00_11:59:00_low_t'])]
# Make copy to avoid "A value is trying to be set on a copy of a slice from a DataFrame."
df = select_df.copy()
# We have to further narrow down our search because some 1-minute data has been adjusted for splits while eod data has not
# This results in some stocks seemingly shooting up, e.g., 8200%
# It makes sense to, e.g., leave out stocks gapping up more than 200% since we rarely see this
# df = df[(df["pattern"] == 'gap-up_short') & (df["date"] > "2020-01-01") & (df["o_pct"] < 2) & (df['shs_float'] < 50) ]
print(f"Number of rows in selection: {len(df.axes[0])}")
Number of rows in selection: 1339
# Calculate percent change from high to low in a new df column
df['morning_09:30:00_11:59:00_high_vs_low_pct'] = df['morning_09:30:00_11:59:00_low'] / df['morning_09:30:00_11:59:00_high'] - 1
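As a quick sanity check of this formula, here is a minimal worked example using the numbers from the VTVT row that shows up in the selection further below (morning high 3.30, morning low 2.80):
# Illustrative sanity check with the VTVT values from the table shown later
high, low = 3.30, 2.80
print(low / high - 1)  # roughly -0.1515, i.e., a drop of about 15% from the morning high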
We care about the difference between the high and the low between 9:30 am and 12 pm, because that is the window in which we would like to short these stocks. A basic premise of the strategy is that a company publishes news to boost its stock, making it gap up. But which gap-up level gives us the best chance of a successful trade? Let's plot the correlation between 'morning_09:30:00_11:59:00_high_vs_low_pct' and 'o_pct'. Before that, let's look at each one individually, starting with 'o_pct' (how much the stock gaps up from the previous close).
parameter = "o_pct"
s = df[parameter].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))
print(f"Median: {statistics.median(s)}")
s = s.sort_values(ascending=True)
# Create plot, successively zooming in by cutting off the long right tail
# (after sorting, positions 11, 10 and 9 hold the next-largest cut-off values)
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[11])])
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[10])])
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[9])], kde=True)
count    1339.000000
mean        0.510562
std         1.271964
min        -0.326000
10%        -0.044000
20%         0.128000
30%         0.201000
40%         0.248200
50%         0.315000
60%         0.416800
70%         0.540600
80%         0.732000
90%         1.075600
max        39.900000
Name: o_pct, dtype: float64
14
Median: 0.4636811799850635
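A note on the indexing trick above: with the nine custom percentiles, describe() returns 14 statistics (count, mean, std, min, the nine percentiles, and max), and after sorting by value the positions 9, 10 and 11 used as histogram cut-offs simply pick the next-largest values below the extremes. Which statistics land there depends on the data; for the full 'o_pct' series they happen to be the 80th percentile, the 90th percentile and the standard deviation. A minimal sketch to inspect the mapping ('s_sorted' is just an illustrative name):
# Show which statistic ends up at which position after sorting by value
s_sorted = df["o_pct"].describe(percentiles=percentiles).sort_values(ascending=True)
for pos in (9, 10, 11):
    print(pos, s_sorted.index[pos], s_sorted.iloc[pos])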
For now, we are only interested in positive values:
parameter = "o_pct"
lower_bound = 0
upper_bound = 2
s = df[parameter][(df[parameter] > lower_bound) & (df[parameter] < upper_bound)].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))
print(f"Median: {statistics.median(s)}")
s = s.sort_values(ascending=True)
# Create plot
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) & (df[parameter] < s.iloc[11])])
# Zoom in
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) & (df[parameter] < s.iloc[10])])
# Zoom in some more
hp = sns.histplot(data = df[parameter][(df[parameter] > lower_bound) & (df[parameter] < s.iloc[9])], kde=True)
count    1120.000000
mean        0.498438
std         0.393432
min         0.002000
10%         0.147900
20%         0.202800
30%         0.242000
40%         0.294600
50%         0.372000
60%         0.457400
70%         0.595000
80%         0.759200
90%         1.038000
max         1.970000
Name: o_pct, dtype: float64
14
Median: 0.4254161403909903
Now for 'morning_09:30:00_11:59:00_high_vs_low_pct':
parameter = 'morning_09:30:00_11:59:00_high_vs_low_pct'
s = df[parameter].describe(percentiles = percentiles)
plt.figure(figsize=(10,5))
print(s)
print(len(s))
print(f"Median: {statistics.median(s)}")
s = s.sort_values(ascending=True)
# Create plot
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[12])])
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[11])])
hp = sns.histplot(data = df[parameter][(df[parameter] < s.iloc[10])], kde=True)
count    1339.000000
mean       -0.239592
std         0.103178
min        -0.803845
10%        -0.376298
20%        -0.327813
30%        -0.283680
40%        -0.250000
50%        -0.219697
60%        -0.198745
70%        -0.175602
80%        -0.151804
90%        -0.125000
max        -0.027495
Name: morning_09:30:00_11:59:00_high_vs_low_pct, dtype: float64
14
Median: -0.2092209415823183
And now the pair plot. We know from above that roughly 90% of all 'o_pct' values are below 1, so let's limit the data in the chart to, e.g., less than 120%.
pp = sns.pairplot(data=df[(df['o_pct'] > 0) & (df['o_pct'] < 1.2)],
                  y_vars=['o_pct'],
                  x_vars=['morning_09:30:00_11:59:00_high_vs_low_pct'],
                  #x_vars=[(df["shs_float"] < 30)],
                  height=5)
pp.fig.set_size_inches(10,10)
# Rotate y-labels for better readability
for ax in pp.axes.flatten():
    # Rotate y axis labels
    ax.set_ylabel(ax.get_ylabel(), rotation = 0)
    # Set y label alignment
    ax.yaxis.get_label().set_horizontalalignment('right')
Let's also do a regression plot:
fig, ax = plt.subplots(figsize=(10,10))
rp = sns.regplot(y="o_pct", x="morning_09:30:00_11:59:00_high_vs_low_pct", data=df[(df['o_pct'] > 0) & (df['o_pct'] < 1.2)], ax=ax)
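To put a number on what the regression plot suggests, here is a minimal sketch using pandas' built-in Pearson correlation on the same slice of the data (the exact value is not reproduced here):
# Pearson correlation between the gap-up and the morning high-to-low drop
subset = df[(df['o_pct'] > 0) & (df['o_pct'] < 1.2)]
print(subset['o_pct'].corr(subset['morning_09:30:00_11:59:00_high_vs_low_pct']))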
We see a nice correlation between the gap-up and how much the stock falls from its high to its low, which makes sense. Based on the analysis above we can now refine our selection from 'source_df': let's add 'o_pct' as a criterion, between 0 and 2.
select_df = source_df[(source_df['morning_09:30:00_11:59:00_high_t'] < source_df['morning_09:30:00_11:59:00_low_t']) & (source_df['o_pct'] > 0) & (source_df['o_pct'] < 2)]
df = select_df.copy()
print(f"Number of rows in selection: {len(df.axes[0])}")
Number of rows in selection: 1120
It's a new df, so we have to calculate 'morning_09:30:00_11:59:00_high_vs_low_pct' again:
# Calculate percent change from high to low in a new df column
df['morning_09:30:00_11:59:00_high_vs_low_pct'] = df['morning_09:30:00_11:59:00_low'] / df['morning_09:30:00_11:59:00_high'] - 1
Display the data to check whether the selection is correct.
df[["date", "symbol", "morning_09:30:00_11:59:00_high_t", "morning_09:30:00_11:59:00_low_t", "morning_09:30:00_11:59:00_high_vs_low_pct", "morning_09:30:00_11:59:00_low", "morning_09:30:00_11:59:00_high"]]
 | date | symbol | morning_09:30:00_11:59:00_high_t | morning_09:30:00_11:59:00_low_t | morning_09:30:00_11:59:00_high_vs_low_pct | morning_09:30:00_11:59:00_low | morning_09:30:00_11:59:00_high
---|---|---|---|---|---|---|---
5 | 2019-01-09 | VTVT | 09:46:00 | 10:46:00 | -0.151515 | 2.80 | 3.3000
14 | 2019-01-17 | LMFA | 09:53:00 | 11:35:00 | -0.322581 | 2.10 | 3.1000
16 | 2019-01-24 | BLNK | 09:37:00 | 09:56:00 | -0.186441 | 2.40 | 2.9500
20 | 2019-01-29 | AVCO | 09:35:00 | 11:05:00 | -0.246914 | 6.10 | 8.1000
21 | 2019-01-16 | TBLT | 09:34:00 | 11:19:00 | -0.400000 | 3.33 | 5.5500
... | ... | ... | ... | ... | ... | ... | ...
2273 | 2023-01-25 | INPX | 09:47:00 | 11:21:00 | -0.251163 | 1.61 | 2.1500
2275 | 2023-01-26 | BZFD | 09:44:00 | 11:09:00 | -0.209099 | 1.21 | 1.5299
2279 | 2023-01-26 | BZFD | 09:44:00 | 11:09:00 | -0.209099 | 1.21 | 1.5299
2287 | 2023-02-02 | TENX | 09:31:00 | 11:48:00 | -0.381395 | 2.66 | 4.3000
2288 | 2023-02-02 | ZVSA | 09:30:00 | 10:43:00 | -0.238255 | 2.27 | 2.9800

1120 rows × 7 columns
Write the dataframe to a CSV file.
df.to_csv("df.csv")
Let's check out some fundamental data.
parameter = "sector"
s = df[parameter].describe(percentiles = percentiles)
print(s)
plt.figure(figsize=(30,5))
hp = sns.histplot(data = df[parameter].replace(np.nan, "n/a"))
# Rotate x-labels for better readability
hp.set_xticklabels(hp.get_xticklabels(), rotation=30)
count           1066
unique            18
top       Healthcare
freq             558
Name: sector, dtype: object
UserWarning: FixedFormatter should only be used together with FixedLocator hp.set_xticklabels(hp.get_xticklabels(), rotation=30)
[x-axis tick labels: Healthcare, Financial, Consumer Cyclical, n/a, Industrials, Basic Materials, Technology, Communication Services, Conglomerates, Energy, Services, Information Technology, Consumer Goods, Industrial Goods, Consumer Discretionary, Utilities, Consumer Defensive, Real Estate]
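The set_xticklabels call above is what triggers the FixedFormatter warning on recent matplotlib versions; a minimal alternative, assuming the same 'hp' axes object, is to rotate the labels via tick_params instead:
# Rotate x tick labels without replacing the formatter (avoids the FixedFormatter warning)
hp.tick_params(axis='x', labelrotation=30)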