This code is an implementation of the Volume-Synchronized Probability of Informed Trading (VPIN) of Easley, Lopez de Prado, and O’Hara (2012), published in the Review of Financial Studies. Easley et al. argue that the CDF of VPIN indicates order flow toxicity, and they use it to explain the flash crash of 2010. You can see the slides from my R/Finance 2014 presentation on the topic.
This version of the code is not particularly fast, and there are plenty of opportunities for a better programmer than me to tune it up for speed.
#### VPIN calculation #########################################################
#install.packages('fasttime',repos='http://www.rforge.net/')
require(data.table); require(fasttime); require(plyr)
# Assuming TAQ data is arranged in 1 year stock csv files
# xts (which attaches zoo) supplies as.xts(), endpoints(), period.apply(),
# rollapply() and na.fill() used in this file; none of data.table, fasttime or
# plyr attaches it, so load it explicitly.
library(xts)
# Keep only the first three columns: timestamp, trade price, trade volume.
stock <- fread('/TAQ_data.csv')
stock <- stock[, 1:3, with = FALSE]
setnames(stock, colnames(stock), c('DateTime', 'Price', 'Volume'))
# Rebuild the stamp as 'YYYY-MM-DD HH:MM:SS' from characters 1-4, 5-6, 7-8 and
# 10-17 of the raw field.
# NOTE(review): these offsets presume a raw layout like 'YYYYMMDD HH:MM:SS' --
# confirm against the actual input file.
stock[, DateTime := paste(paste(substr(DateTime, 1, 4), substr(DateTime, 5, 6),
                                substr(DateTime, 7, 8), sep = '-'),
                          substr(DateTime, 10, 17))]
# Sort by the (still character) timestamp before converting it.
setkey(stock, DateTime)
stock[, DateTime := fastPOSIXct(DateTime, tz = 'GMT')]
stock <- as.xts(stock)
# Now we have an xts data frame called 'stock' with a DateTime index and...
# two columns: Price and Volume
# VPIN: Volume-Synchronized Probability of Informed Trading.
#
# Arguments:
#   stock   - xts object with (at least) the columns 'Price' and 'Volume'.
#   Vbucket - number of volume buckets in an average volume day (Vbucket=50).
#             It is also the rolling-window length, in buckets, used for both
#             the price-change standard deviation and the VPIN sum.
# Returns:
#   A one-column xts object ('VPIN') with one row per volume bucket, stamped
#   with the time of the last one-minute bar assigned to that bucket.
# Requires xts/zoo, plyr and fasttime to be attached.
VPIN <- function(stock, Vbucket) {
  stopifnot(all(c('Price', 'Volume') %in% colnames(stock)),
            is.numeric(Vbucket), length(Vbucket) == 1, Vbucket >= 1)

  # Price change between consecutive rows; the first value is a padded NA.
  stock$dP1 <- diff(stock[, 'Price'], lag = 1, differences = 1, na.pad = TRUE)

  # Aggregate to one-minute bars: summed price change and summed volume.
  ends <- endpoints(stock, 'minutes')
  timeDF <- period.apply(stock[, 'dP1'], INDEX = ends, FUN = sum)
  timeDF$Volume <- period.apply(stock[, 'Volume'], INDEX = ends, FUN = sum)

  # Bucket size: mean daily volume divided by the number of buckets per day.
  Vbar <- mean(period.apply(timeDF[, 'Volume'],
                            INDEX = endpoints(timeDF, 'days'),
                            FUN = sum)) / Vbucket

  # Each bar's volume in bucket units, and the running bucket count.
  timeDF$Vfrac <- timeDF[, 'Volume'] / Vbar
  timeDF$CumVfrac <- cumsum(timeDF[, 'Vfrac'])

  # Next = share of this bar's volume that falls past the most recent bucket
  # boundary. It is < 1 only when the bar actually crosses a boundary; when it
  # is >= 1 the whole bar lies inside the current bucket, so nothing is handed
  # to the next bucket. (The original zeroed the < 1 rows instead, which left
  # weights >= 1 and made (1 - Next) negative for non-crossing bars.)
  timeDF$Next <- (timeDF[, 'CumVfrac'] - floor(timeDF[, 'CumVfrac'])) /
    timeDF[, 'Vfrac']
  timeDF[timeDF[, 'Next'] >= 1, 'Next'] <- 0

  # Price change carried over from the previous bar, plus the part of this
  # bar's price change that stays in the current bucket.
  timeDF$Previous <- lag(timeDF[, 'dP1']) * lag(timeDF[, 'Next'])
  timeDF$dP2 <- (1 - timeDF[, 'Next']) * timeDF[, 'dP1'] + timeDF[, 'Previous']

  # Vtick - diff(Vtick) is the previous bar's floor(CumVfrac), i.e. the number
  # of buckets completed before this bar; the first bar gets bucket 0.
  timeDF$Vtick <- floor(timeDF[, 'CumVfrac'])
  timeDF[, 'Vtick'] <- timeDF[, 'Vtick'] - diff(timeDF[, 'Vtick'])
  timeDF[1, 'Vtick'] <- 0

  # Keep only the last one-minute bar of every bucket.
  # NOTE(review): dP2 of the earlier bars in a bucket is dropped here rather
  # than accumulated -- confirm this is the intended bucket price change.
  timeDF <- as.data.frame(timeDF)
  timeDF[, 'DateTime'] <- row.names(timeDF)
  timeDF <- ddply(timeDF, .(Vtick), last)
  timeDF <- as.xts(timeDF[, c('Volume', 'dP2', 'Vtick')],
                   order.by = fastPOSIXct(timeDF$DateTime, tz = 'GMT'))

  # The first bucket's price change is forced to 0 (it can inherit the NA from
  # the padded first difference / first lag).
  timeDF[1, 'dP2'] <- 0

  # Rolling standard deviation of bucket price changes; NA edges are extended
  # from the nearest available value.
  timeDF$sigma <- rollapply(timeDF[, 'dP2'], Vbucket, sd, fill = NA)
  timeDF$sigma <- na.fill(timeDF$sigma, "extend")

  # Bulk classification: split each bucket's volume into buy and sell parts
  # using the normal CDF of the standardised price change.
  timeDF$Vbuy <- Vbar * pnorm(timeDF[, 'dP2'] / timeDF[, 'sigma'])
  timeDF$Vsell <- Vbar - timeDF[, 'Vbuy']
  timeDF$OI <- abs(timeDF[, 'Vsell'] - timeDF[, 'Vbuy'])

  # VPIN: rolling sum of order imbalance over Vbucket buckets, divided by the
  # total volume in that window.
  timeDF$VPIN <- rollapply(timeDF[, 'OI'], Vbucket, sum) / (Vbar * Vbucket)
  timeDF[, c('VPIN')]
}
# Compute the VPIN series using 50 volume buckets per average trading day.
out <- VPIN(stock, Vbucket = 50)
###############################################################################
Here is what the original file looks like:
1993-01-04 09:35:25,10.375,5300,40,0,,N
1993-01-04 09:36:49,10.375,25000,40,0,,N
1993-01-04 09:53:06,10.375,100,40,0,,N
1993-01-04 10:04:13,10.375,200,40,0,,N
1993-01-04 10:04:20,10.375,100,40,0,,N
1993-01-04 10:24:42,10.375,1000,40,0,,N
1993-01-04 10:25:19,10.375,600,40,0,,N
1993-01-04 11:31:04,10.5,10000,40,0,,N
1993-01-04 12:13:09,10.5,200,0,0,,M
1993-01-04 12:13:38,10.5,200,0,0,,M