= (
gist_url "https://gist.githubusercontent.com/patrick-weiss/"
"86ddef6de978fbdfb22609a7840b5d8b/raw/"
"8fbcc6c6f40f537cd3cd37368be4487d73569c6b/"
)
with httpimport.remote_repo(gist_url):
from clean_enhanced_TRACE_python import clean_enhanced_trace
Clean Enhanced TRACE with Python
This appendix contains code to clean enhanced TRACE with Python. It is also available via the following GitHub gist. Hence, you could also source the file with the following chunk.
We need this function in TRACE and FISD to download and clean enhanced TRACE trade messages following Dick-Nielsen (2009) and Dick-Nielsen (2014) for enhanced TRACE specifically. This code is based on the resources provided by the project Open Source Bond Asset Pricing and their related publication Dickerson, Mueller, and Robotti (2023). We encourage that you acknowledge their effort. Relatedly, WRDS provides SAS code to clean enhanced TRACE data.
The function takes a vector of CUSIPs (in cusips
), a connection to WRDS (connection
) explained in Chapter 3, and a start and end date (start_date
and end_date
, respectively). Specifying too many CUSIPs will result in very slow downloads and a potential failure due to the size of the request to WRDS. The dates should be within the coverage of TRACE itself, i.e., starting after 2002, and the dates should be supplied as a string indicating MM/DD/YYYY. The output of the function contains all valid trade messages for the selected CUSIPs over the specified period.
def clean_enhanced_trace(cusips,
connection, ="'01/01/2002'",
start_date="'12/31/2023'"):
end_date"""Clean enhanced TRACE data."""
import pandas as pd
import numpy as np
# Load main file
= (
trace_query "SELECT cusip_id, bond_sym_id, trd_exctn_dt, "
"trd_exctn_tm, days_to_sttl_ct, lckd_in_ind, "
"wis_fl, sale_cndtn_cd, msg_seq_nb, "
"trc_st, trd_rpt_dt, trd_rpt_tm, "
"entrd_vol_qt, rptd_pr, yld_pt, "
"asof_cd, orig_msg_seq_nb, rpt_side_cd, "
"cntra_mp_id, stlmnt_dt, spcl_trd_fl "
"FROM trace.trace_enhanced "
f"WHERE cusip_id IN {cusips} "
f"AND trd_exctn_dt BETWEEN {start_date} AND {end_date}‚"
)
= pd.read_sql_query(
trace_all =trace_query,
sql=connection,
con={"trd_exctn_dt","trd_rpt_dt", "stlmnt_dt"}
parse_dates
)
# Post 2012-06-02
## Trades (trc_st = T) and correction (trc_st = R)
= (trace_all
trace_post_TR "trc_st in ['T', 'R']")
.query("trd_rpt_dt >= '2012-06-02'")
.query(
)
# Cancellations (trc_st = X) and correction cancellations (trc_st = C)
= (trace_all
trace_post_XC "trc_st in ['X', 'C']")
.query("trd_rpt_dt >= '2012-06-02'")
.query("cusip_id", "msg_seq_nb", "entrd_vol_qt",
.get(["rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
=True)
.assign(drop
)
## Cleaning corrected and cancelled trades
= (trace_post_TR
trace_post_TR ="left")
.merge(trace_post_XC, how"drop != True")
.query(="drop")
.drop(columns
)
# Reversals (trc_st = Y)
= (trace_all
trace_post_Y "trc_st == 'Y'")
.query("trd_rpt_dt >= '2012-06-02'")
.query("cusip_id", "orig_msg_seq_nb", "entrd_vol_qt",
.get(["rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
=True)
.assign(drop={"orig_msg_seq_nb": "msg_seq_nb"})
.rename(columns
)
# Clean reversals
## Match the orig_msg_seq_nb of Y-message to msg_seq_nb of main message
= (trace_post_TR
trace_post ="left")
.merge(trace_post_Y, how"drop != True")
.query(="drop")
.drop(columns
)
# Pre 06-02-12
## Trades (trc_st = T)
= (trace_all
trace_pre_T "trd_rpt_dt < '2012-06-02'")
.query(
)
# Cancellations (trc_st = C)
= (trace_all
trace_pre_C "trc_st == 'C'")
.query("trd_rpt_dt < '2012-06-02'")
.query("cusip_id", "orig_msg_seq_nb", "entrd_vol_qt",
.get(["rptd_pr", "rpt_side_cd", "cntra_mp_id",
"trd_exctn_dt", "trd_exctn_tm"])
=True)
.assign(drop={"orig_msg_seq_nb": "msg_seq_nb"})
.rename(columns
)
# Remove cancellations from trades
## Match orig_msg_seq_nb of C-message to msg_seq_nb of main message
= (trace_pre_T
trace_pre_T ="left")
.merge(trace_pre_C, how"drop != True")
.query(="drop")
.drop(columns
)
# Corrections (trc_st = W)
= (trace_all
trace_pre_W "trc_st == 'W'")
.query("trd_rpt_dt < '2012-06-02'")
.query(
)
# Implement corrections in a loop
## Correction control
= len(trace_pre_W)
correction_control = len(trace_pre_W)
correction_control_last
## Correction loop
while (correction_control > 0):
# Create placeholder
## Only identifying columns of trace_pre_T (for joins)
= (trace_pre_T
placeholder_trace_pre_T "cusip_id", "trd_exctn_dt", "msg_seq_nb"])
.get([={"msg_seq_nb": "orig_msg_seq_nb"})
.rename(columns=True)
.assign(matched_T
)
# Corrections that correct some msg
= (trace_pre_W
trace_pre_W_correcting ="left")
.merge(placeholder_trace_pre_T, how"matched_T == True")
.query(="matched_T")
.drop(columns
)
# Corrections that do not correct some msg
= (trace_pre_W
trace_pre_W ="left")
.merge(placeholder_trace_pre_T, how"matched_T != True")
.query(="matched_T")
.drop(columns
)
# Create placeholder
## Only identifying columns of trace_pre_W_correcting (for anti-joins)
= (trace_pre_W_correcting
placeholder_trace_pre_W_correcting "cusip_id", "trd_exctn_dt", "orig_msg_seq_nb"])
.get([={"orig_msg_seq_nb": "msg_seq_nb"})
.rename(columns=True)
.assign(corrected
)
# Delete msgs that are corrected
= (trace_pre_T
trace_pre_T ="left")
.merge(placeholder_trace_pre_W_correcting, how"corrected != True")
.query(="corrected")
.drop(columns
)
# Add correction msgs
= pd.concat([trace_pre_T, trace_pre_W_correcting])
trace_pre_T
# Escape if no corrections remain or they cannot be matched
= len(trace_pre_W)
correction_control
if correction_control == correction_control_last:
break
else:
= len(trace_pre_W)
correction_control_last continue
# Reversals (asof_cd = R)
## Record reversals
= (trace_pre_T
trace_pre_R "asof_cd == 'R'")
.query("cusip_id", "trd_exctn_dt",
.sort_values(["trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm"])
)
## Prepare final data
= (trace_pre_T
trace_pre
.query("asof_cd == None | asof_cd.isnull() | asof_cd not in ['R', 'X', 'D']"
)"cusip_id", "trd_exctn_dt",
.sort_values(["trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm"])
)
## Add grouped row numbers
"seq"] = (trace_pre_R
trace_pre_R["cusip_id", "trd_exctn_dt", "entrd_vol_qt",
.groupby(["rptd_pr", "rpt_side_cd", "cntra_mp_id"])
.cumcount()
)
"seq"] = (trace_pre
trace_pre["cusip_id", "trd_exctn_dt", "entrd_vol_qt",
.groupby(["rptd_pr", "rpt_side_cd", "cntra_mp_id"])
.cumcount()
)
## Select columns for reversal cleaning
= (trace_pre_R
trace_pre_R "cusip_id", "trd_exctn_dt", "entrd_vol_qt",
.get(["rptd_pr", "rpt_side_cd", "cntra_mp_id", "seq"])
=True)
.assign(reversal
)
## Remove reversals and the reversed trade
= (trace_pre
trace_pre ="left")
.merge(trace_pre_R, how"reversal != True")
.query(=["reversal", "seq"])
.drop(columns
)
# Combine pre and post trades
= pd.concat([trace_pre, trace_post])
trace_clean
# Keep agency sells and unmatched agency buys
= (trace_clean
trace_agency_sells "cntra_mp_id == 'D' & rpt_side_cd == 'S'")
.query(
)
# Placeholder for trace_agency_sells with relevant columns
= (trace_agency_sells
placeholder_trace_agency_sells "cusip_id", "trd_exctn_dt",
.get(["entrd_vol_qt", "rptd_pr"])
=True)
.assign(matched
)
# Agency buys that are unmatched
= (trace_clean
trace_agency_buys_filtered "cntra_mp_id == 'D' & rpt_side_cd == 'B'")
.query(="left")
.merge(placeholder_trace_agency_sells, how"matched != True")
.query(="matched")
.drop(columns
)
# Non-agency
= (trace_clean
trace_nonagency "cntra_mp_id == 'C'")
.query(
)
# Agency cleaned
= pd.concat([trace_nonagency,
trace_clean
trace_agency_sells,
trace_agency_buys_filtered])
# Additional Filters
= (trace_clean
trace_add_filters
.assign(= lambda x: (
days_to_sttl_ct2 "stlmnt_dt"]-x["trd_exctn_dt"]).dt.days
(x[
)
)
.assign(= lambda x: pd.to_numeric(
days_to_sttl_ct "days_to_sttl_ct"], errors='coerce'
x[
)
)"days_to_sttl_ct.isnull() | days_to_sttl_ct <= 7")
.query("days_to_sttl_ct2.isnull() | days_to_sttl_ct2 <= 7")
.query("wis_fl == 'N'")
.query("spcl_trd_fl.isnull() | spcl_trd_fl == ''")
.query("asof_cd.isnull() | asof_cd == ''")
.query(
)
# Only keep necessary columns
= (trace_add_filters
trace_final "cusip_id", "trd_exctn_dt", "trd_exctn_tm"])
.sort_values(["cusip_id", "trd_exctn_dt", "trd_exctn_tm", "rptd_pr",
.get(["entrd_vol_qt", "yld_pt", "rpt_side_cd", "cntra_mp_id"])
)
return trace_final