/* Do file for master thesis 2020 - Anette Normann & Emilie Plaum */

/* Note that corresponding Excel sheet file paths must be inserted where import is stated. Attached to this do-file there are two tab-delimited text files, which must be merged into one excel file (one sheet for each).*/

/*
 Firstly, the following commands must be executed to install the packages needed to run this scrips:
1) * ssc install ftools
2) * ssc install reghdfe
3) * ssc install esttab
4) * ssc install egenmore
5) * set scrollbufsize 300000 / Need to reload stata to activate

Press Ctrl + D to run entire script. 

*/


clear all
version 16.1 //This script needs version 16.1 of STATA to run.


///////////////////////////////////////////////////////////////////
/* Create graph for historical  Market values and number of ETFs */
///////////////////////////////////////////////////////////////////

/*
Import Excel file - this is the sample of ETFs from 1993 to 2019
*/

#delimit ; // Set ; as delimiter for end of command.
import excel "C:\Users\anett\HANDELSHØYSKOLEN BI (NBSSTD)\Plaum, Emilie - Master Thesis\Regression\Variables in the regression\All variables in one sheet_sharecode 10_11\Full dataset.xlsx", sheet("Market value ETFs 1993-2019") firstrow ;

gen MV = MarketValue/1000000 ;

graph bar (sum) MV if Year > 1999, 
	over(Year, lab(labsize(small) angle(45))) 
	ysize(3.6)
	xsize(5.51)
	ytick(#3, grid) 
	ylabel(,labsize(vsmall) angle(0)) 
	ytitle("\$ U.S. Billions") 
	blabel(bar, position(outside) size(vsmall) format(%9.0f) color(black))  
	note("Source:""Wharton Research Data Services. CRSP Monthly Stock File." 		
	"wrds.wharton.upenn.edu", size(vsmall) margin(medsmall)) 
	saving(MV_ETFs,replace) 
	name(MV_ETFs)
	 ;

graph bar (count) Count if Year > 1999, 
	over(Year, lab(angle(45))) 
	ysize(3.6)
	xsize(5.51)
	ytick(#3, grid) 
	ylabel(,labsize(small)  angle(0)) ytitle("Frequency") 
	blabel(bar, position(outside) size(small) format(%9.0f) color(black))
	note("Source:""Wharton Research Data Services. CRSP Monthly Stock File." 		
	"wrds.wharton.upenn.edu", size(vsmall) margin(medsmall)) 
	saving(Number_ETFs,replace) 
	name(Number_ETFs)
	 ;
	

#delimit cr
clear  // The above section is finished.

 

////////////////////////////////////////////////////////
/*PREPARE SAMPLE DATA FOR GRAPHS OF SUMMARY STATISTICS*/
////////////////////////////////////////////////////////



/*
Import Excel File - this is the main sample 
*/
#delimit ; // Set ; as delimiter for end of command.

import excel "C:\Users\anett\HANDELSHØYSKOLEN BI (NBSSTD)\Plaum, Emilie - Master Thesis\Regression\Variables in the regression\All variables in one sheet_sharecode 10_11\Full dataset.xlsx", sheet("All variables") firstrow ;


/*Rename variables to names that are quick to write, then make accompanying labels*/ 

rename (YEAR MONTHNUM DATE PERMNO TICKER COMNAM CUSIP ETFOWN MCAP DVOL RDVOL IP BASPREAD 		AMIHUD BTM GPROFIT P12MRET SQRDRET)
		(year monthnum fulldate permno tic comp cusip etfown mcap dvol rdvol ip basprd 			illiq btm gprofit p12mret sqrdret) ;

lab var year "Year" ;
lab var fulldate "MM/DD/YYYY" ;
lab var comp "Company Name" ;
lab var etfown "ETF Ownership" ;
lab var mcap "Market Capitalization" ;
lab var dvol "Daily Volatility" ;
lab var rdvol "Realized Daily Volatility" ;
lab var illiq "Amihud (2002)" ;
lab var basprd "Bid-Ask Spread" ;
lab var btm "Book-to-Market" ;
lab var gprofit "Gross Profitability" ;
lab var ip "Inverse Price" ;
lab var p12mret "Past 12-month returns" ;
lab var sqrdret "Squared Intraday Returns" ;

/* 
Generate the logged market capitalization from the mcap variable 
*/
gen lmcap = log(mcap) ;
lab var lmcap "Logged Market Capitalization" ;
move lmcap illiq ;


/*
Create a monthly time variable that STATA understands 
*/
gen int date = ym(year, monthnum) ;
move date permno ;
lab var date "Date" ;
format date %tm ;

/*
Sort data by year
*/
sort year ;


/*
Generate the daily volatility, ETF ownership and bid-ask spread variables in percentage terms (acc. to Ben-David et al. 2018)
*/
gen dvolp = dvol*100 ;
lab var dvolp "Daily Volatility (%)" ;
move dvolp mcap ;

gen etfownp = etfown*100 ;
lab var etfownp "ETF Ownership (%)" ;
move etfownp dvolp ;

gen basprdp = basprd*100 ;
lab var basprdp "Bid-Ask Spread (%)" ;
move basprdp btm ;


#delimit cr

/*
Summary statistics and correlation matrix 
*/
summarize dvolp etfown lmcap ip illiq basprdp btm gprofit p12mret, detail

/*
Estimate correlation matrix
*/
correlate dvolp etfownp lmcap ip illiq basprdp btm gprofit p12mret 

/*
We sort our sample by firm and month/year (date)
*/
sort permno date 

/*
Declear our dataset as panel data, with permno as the id and date as time variable, let stata know that our observations are monthly.
*/
xtset permno date, monthly  // but with gaps

/*
Look at the distribution of the observations of our panel
*/
xtdescribe 

/* 
We have two variables that we know leave and re-enter the S&P 500 during our time period, we choose to eliminate these from our sample such that we avoid gaps in the panel
*/

drop if permno == 61241 
drop if permno == 68591 

/* 
Run the panel data declaration again to make sure we have eliminated the gaps
*/
xtset permno date, monthly 

/*
Look at the distribution of the observations of our panel
*/
xtdescribe 

/*
New summary statistics and correlation matrix for final sample
*/
summarize dvolp etfown lmcap ip illiq basprdp btm gprofit p12mret, detail

/*
Estimate correlation matrix
*/
correlate dvolp etfownp lmcap ip illiq basprdp btm gprofit p12mret 



/*////////////////////////////////////////////////////////////////
/ THE FOLLOWING PART CORRESPONDS TO CHAPTER 4 IN THE REPORT      /
/////////////////////////////////////////////////////////////// */

bysort year: summarize etfown, detail

bysort year: summarize dvolp, detail


#delimit ; 

graph box dvolp, over(year)
	ysize(3.6)
	xsize(5.51)
	note("Source: CRSP Daily Stock File"
	"wrds.wharton.upenn.edu", size(vsmall) margin(medsmall)) 
	name("Daily_Volatility_Box_Plot") 
	saving(dvolpbox, replace) 
	;

twoway (tsline sqrdret),
	name("Volatility_Clustering")
	saving(volclus, replace) 
	;

#delimit cr

/*
Assign percentiles to the dependent variable, for each month. 
*/
egen numdate = group(date)
egen percentile = xtile(dvolp), by(numdate) nq(100) //This takes a while.

/*
Standardardizing variables to ease interpretation
*/
foreach var of varlist dvolp etfownp  { 
egen z`var'=std(`var'), mean(0) std(1)
 } 


/*
Prepare variables for regression:
1) Cointegration test, Kao (1999)
H_0: No Cointegration
H_A: All panels are cointegrated
*/

xtcointtest kao zdvolp zetfownp lmcap illiq basprdp btm gprofit ip p12mret
// Test result: p-value = 0.0000 (for all tests) < 0.05 -> Reject H_0.

/*
2) Woolridge test for serial correlation in panel-data models (Drukker (2003) and Woolridge (2002))
H_0: No first-order autocorrelation
H_A: Cannot reject null
*/
xtserial zdvolp zetfownp lmcap illiq basprd btm gprofit ip p12mret, output
// Test result: p-value = 0.0000 < 0.05 -> Reject H_0 -> autocorrelation is present.
 
/* 
Create macros for the dependent and independent variables to save time and space
*/
global ylist zdvolp 
global xlist zetfownp L.lmcap L.ip L.illiq L.basprdp L.btm L.gprofit L.p12mret 


/*
Begin by estimating a  simple pooled regression with neither fixed nor random effects.
Here we are ignoring the panel structure of our dataset
*/
regress $ylist $xlist

/*
Hausman test for fixed effects versus random effects model
*/
xtreg $ylist $xlist , fe
estimates store fixed
xtreg $ylist $xlist , re
estimates store random
hausman fixed random, sigmamore 

// Prob > F = 0.0000 < 0.05 --> Implies fixed effects model is to be used

xtreg $ylist $xlist i.date, fe

/*
Test if time fixed effects are needed when running a fixed effects model.
Wald:
	h_0: Coefficients for all years are jointly equal to zero
	h_a: Coefficients for all years are not jointly equal to zero.
*/
testparm i.date // Prob > F = 0.0000 < 0.05 --> reject null hypothesis, include time-fixed effects



///////////////////////////////////////////////////////////////////////////
/*THE FOLLOWING PART OF THE SCRIPT CORRESPONDS TO CHAPTER 5 IN THE REPORT*/
///////////////////////////////////////////////////////////////////////////


/////////////////
/* FULL SAMPLE */
/////////////////


/* 
(1): 
daily volatility, etf ownership and controls
time- and stock-fixed effects
two-way clustered standard errors
*/
reghdfe $ylist $xlist, absorb(permno date) vce(cluster permno date) noconstant 

estimates store r1

/*
(3): 
daily volatility, etf ownership and controls + 3 lags of volatility
time- and stock-fixed effects
two-way clustered standard errors
*/

/* 
Control how many lags are appropriate, if p < 0.05 we include the lags in the model
*/
global lagdvolp L.zdvolp L2.zdvolp L3.zdvolp L4.zdvolp L5.zdvolp L6.zdvolp L7.zdvolp L8.zdvolp L9.zdvolp L10..zdvolp L11.zdvolp L12.zdvolp 

reghdfe $ylist $xlist $lagdvolp, absorb(permno date) vce(cluster permno date) noconstant 

/*
Employ 3 lags of the dependent variable
*/

global lagdvolp L.zdvolp L2.zdvolp L3.zdvolp 

reghdfe $ylist $xlist $lagdvolp, absorb(permno date) vce(cluster permno date) noconstant 

estimates store r2

/*
(2):
daily volatility, etf ownership and controls 
time- and stock-fixed effects
two-way clustered standard errors
*/

reghdfe $ylist $xlist if L3.zdvolp > . | L3.zdvolp < ., absorb(permno date) vce(cluster permno date) noconstant 
estimates store r3

/*
Table with results
Note that table does not report stars at the same level of significance as our report
*/
	
esttab r1 r3 r2, t scalars(F df_m df_r) label title(Combination) nonumbers mtitles("r1""r3""r2") 



//////////////////////////
/* MAGNITUDE ESTIMATION */
//////////////////////////

/* 
Full sample
*/

summarize zdvolp if numdate == 132, detail

summarize zetfownp if percentile == 50 | percentile == 51  //Standard deviation of ETF ownership across stocks and time. 


/*
2008-2009jan
*/
summarize etfown if percentile == 50 & numdate < 19 | percentile == 51 & numdate < 19


/*
2010-2018
*/

summarize etfown if percentile == 50 & numdate > 18 | percentile == 51 & numdate > 18



/////////////////////////////////////
/* ROBUSTNESS: REALIZED VOLATILITY */
/////////////////////////////////////


gen rdvolp = rdvol*100

summarize rdvolp, detail
corr rdvolp etfownp lmcap ip illiq basprd btm gprofit p12mret


foreach var of varlist rdvolp  { 
egen z`var'=std(`var'), mean(0) std(1)
 } 
 
global ylist zrdvolp 
global xlist zetfownp L.lmcap L.ip L.illiq L.basprdp L.btm L.gprofit L.p12mret 

reghdfe $ylist $xlist, absorb(permno date) vce(cluster permno date) noconstant 
estimates store r4

global lagrdvolp L.zrdvolp L2.zrdvolp L3.zrdvolp 

reghdfe $ylist $xlist $lagrdvolp, absorb(permno date) vce(cluster permno date) noconstant 
estimates store r5

reghdfe $ylist $xlist if L3.zrdvolp > . | L3.zrdvolp < ., absorb(permno date) vce(cluster permno date) noconstant 
estimates store r6

/*
Table with results
Note that table does not report stars at the same level of significance as our report
*/
	
esttab r4 r6 r5, t scalars(F df_m df_r) label title(Combination) nonumbers mtitles("r4""r6""r5") 



////////////////////////////////////////
/* ROBUSTNESS: SAMPLE SPLIT BY PERIOD */
////////////////////////////////////////


/////////////////
/* SHORT SAMPLE /
/////////////////

Adjustment to variables for 2008- june 2009 sample:
Need to adjust standardized variables to mean and std.dev. of sample period.
*/

summarize dvolp etfown lmcap ip illiq basprd btm gprofit p12mret if numdate < 19, detail

corr dvolp etfownp lmcap ip illiq basprd btm gprofit p12mret if numdate < 19

foreach var of varlist dvolp etfownp  { 
egen shortz`var'=std(`var') if numdate < 19, mean(0) std(1)
 } 

global ylist shortzdvolp 
global xlist shortzetfownp L.lmcap L.ip L.illiq L.basprdp L.btm L.gprofit L.p12mret 
 
/* 
(1.2.1): 
PERIOD: 2008- june 2009
daily volatility, etf ownership and controls
time- and stock-fixed effects
two-way clustered standard errors
*/

reghdfe $ylist $xlist, absorb(permno date) vce(cluster permno) noconstant
estimates store r7

/*
(3.2.1)
PERIOD: 2008- june 2009
daily volatility, etf ownership and controls + 3 lags of volatility
time- and stock-fixed effects
two-way clustered standard errors
*/

global lagdvolp L1.shortzdvolp L2.shortzdvolp L3.shortzdvolp 

reghdfe $ylist $xlist $lagdvolp, absorb(permno date) vce(cluster permno) noconstant
estimates store r8

/*
(2.2.2)
PERIOD: 2008- june 2009
daily volatility, etf ownership and controls 
time- and stock-fixed effects
two-way clustered standard errors
*/

/* 
Three lags of dependent variables
*/

reghdfe $ylist $xlist if L3.shortzdvolp > . | L3.shortzdvolp < ., absorb(permno date) vce(cluster permno) noconstant

estimates store r9


/*
Table with results
PERIOD: 2008- june 2009
Note that table does not report stars at the same level of significance as our report
*/
esttab r7 r9 r8, t scalars(F df_m df_r) label title(Combination) nonumbers mtitles("r7""r9""r8") 



////////////////
/* LONG SAMPLE /
////////////////

Adjustment to variables for july 2009-2018 sample:
Need to adjust standardized variables to mean and std.dev. of sample period.
*/

summarize dvolp etfown lmcap ip illiq basprd btm gprofit p12mret if numdate > 18, detail
corr dvolp etfownp lmcap ip illiq basprd btm gprofit p12mret if numdate > 18


foreach var of varlist dvolp etfownp  { 
egen longz`var'=std(`var') if numdate > 18, mean(0) std(1)
 } 
 

global ylist longzdvolp 
global xlist longzetfownp L.lmcap L.ip L.illiq L.basprdp L.btm L.gprofit L.p12mret 
 

/*
(1.2.2): 
PERIOD: july 2009-2018
daily volatility, etf ownership and controls
time- and stock-fixed effects
two-way clustered standard errors
*/
reghdfe $ylist $xlist, absorb(permno date) vce(cluster permno) noconstant
estimates store r10

/*
(3.2.2)
PERIOD: july 2009-2018
daily volatility, etf ownership and controls + 3 lags of volatility
time- and stock-fixed effects
two-way clustered standard errors
*/

global lagdvolp L1.longzdvolp L2.longzdvolp L3.longzdvolp

reghdfe $ylist $xlist $lagdvolp, absorb(permno date) vce(cluster permno date) noconstant
estimates store r11

/*
(2.2.2)
PERIOD: july 2009-2018
daily volatility, etf ownership and controls 
time- and stock-fixed effects
two-way clustered standard errors
*/

reghdfe $ylist $xlist if L3.longzdvolp > . | L3.longzdvolp < ., absorb(permno date) vce(cluster permno date) noconstant
estimates store r12

/*
Table with results
PERIOD: july 2009-2018
Note that table does not report stars at the same level of significance as our report
*/
esttab r10 r12 r11, t scalars(F df_m df_r) label title(Combination) nonumbers mtitles("r10""r12""r11")