# Entry points of this NDL script: load the macro section first, then run the
# section that builds the network.
load=ndlMacroDefine
run=ndlCreateNetwork

ndlMacroDefine=[
    # Macro definitions
    # DelayNode: wraps x in a Delay node with a delay time of 1.
    # NOTE(review): delayInput refers to Dout, which is not defined in this macro
    # or in the visible part of this file - confirm where it is expected to come from.
    DelayNode(x)
    {
        D=Delay(x, delayInput=Dout, delayTime=1);
    }


    # Mean/variance normalization: computes the mean and the inverse standard
    # deviation of x and feeds both into a per-dimension normalization of x.
    MeanVarNorm(x)=[
        xMean = Mean(x);
        xStdDev = InvStdDev(x);
        xNorm = PerDimMeanVarNormalization(x, xMean, xStdDev);
    ]
 
    # Inline rectified linear feed forward: RectifiedLinear applied to w1 * x1 + b1,
    # written as one nested expression without intermediate variables
    RFF_R(x1, w1, b1)=RectifiedLinear(Plus(Times(w1,x1),b1))
    # Feed forward: affine transform of the input, W1 * X1 + B1
    FF(X1, W1, B1)
    [
        # product of the weights and the input
        T=Times(W1,X1);
        # add the bias
        P=Plus(T, B1);
    ]
    # Base feed forward network: defines its own bias and weight parameters and
    # applies the FF macro to the input.
    #   in   - input node
    #   rows - number of rows of W and size of B
    #   cols - number of columns of W
    BFF(in, rows, cols)
    {
        # bias, initialized to the fixed value 0
        B = Parameter(rows, init=fixedvalue, value=0);
        # weight matrix, no explicit initialization requested
        W = Parameter(rows, cols);
        # the parameters are referenced with the same casing they are declared with
        FF = FF(in, W, B);
    }
    # Rectified linear base feed forward: a BFF layer, which creates its own
    # parameters, followed by RectifiedLinear
    RBFF(in,rowCount,colCount)
    {
        BFF = BFF(in, rowCount, colCount);
        RL = RectifiedLinear(BFF);
    }
    # Rectified linear feed forward: FF with caller-supplied weights W2 and
    # bias B2, followed by RectifiedLinear
    RFF(X2,W2,B2)=[
        FF = FF(X2, W2, B2);
        RL = RectifiedLinear(FF);
    ]
    # Rectified linear feed forward with dropout: the RFF macro applied to
    # caller-supplied weights and bias, with Dropout on its result.
    RFFD(X3,W3,B3)
    {
        RFF = RFF(X3, W3, B3);
        DO = Dropout(RFF);
    }
    # Sigmoid base feed forward: a BFF layer, which creates its own parameters,
    # followed by Sigmoid.
    SBFF(in,rowCount,colCount)
    {
        BFF = BFF(in, rowCount, colCount);
        S = Sigmoid(BFF);
    }
    # Sigmoid feed forward: FF with caller-supplied weights W2 and bias B2,
    # followed by Sigmoid
    SFF(X2,W2,B2)=[
        FF = FF(X2, W2, B2);
        S = Sigmoid(FF);
    ]
    # Sigmoid feed forward with dropout: the SFF macro applied to
    # caller-supplied weights and bias, with Dropout on its result.
    SFFD(X3,W3,B3)
    {
        SFF = SFF(X3, W3, B3);
        DO = Dropout(SFF);
    }
    # Sigmoid base feed forward with dropout: the SBFF macro, which creates its
    # own parameters, with Dropout on its result.
    SBFFD(inValue,rowCount,colCount)
    {
        SBFF = SBFF(inValue, rowCount, colCount);
        DO = Dropout(SBFF);
    }
    # Softmax feed forward: FF applied to x with caller-supplied weights y and
    # bias z, fed together with the labels into CrossEntropyWithSoftmax.
    SMFF(x,y,z, labels)
    {
        FF = FF(x, y, z);
        SM = CrossEntropyWithSoftmax(labels, FF);
    }
    # Softmax base feed forward: a BFF layer with r rows and c columns, which
    # creates its own parameters, fed together with the labels into
    # CrossEntropyWithSoftmax.
    SMBFF(x,r,c, labels)
    {
        BFF = BFF(x, r, c);
        SM = CrossEntropyWithSoftmax(labels, BFF);
    }
    
    # Inline rectified linear feed forward with dropout: Dropout applied to
    # RectifiedLinear of w1 * x1 + b1, written as one nested expression
    RFFD_R(x1, w1, b1)={Dropout(RectifiedLinear(Plus(Times(w1,x1),b1)))}

    # Log prior: the Mean of the labels passed through Log.
    LogPrior(labels)
    {
        Prior = Mean(labels);
        LogPrior = Log(Prior);
    }

    # Sigmoid base feed forward with two inputs: one BFF layer per input, both
    # with rowCount rows, summed and passed through Sigmoid.
    SBFF_multi(in1, in2, rowCount, colCount1, colCount2)
    {
        BFF1 = BFF(in1, rowCount, colCount1);
        BFF2 = BFF(in2, rowCount, colCount2);
        # NOTE(review): B is not referenced by any later line of this macro -
        # confirm whether it was meant to be added to the sum below.
        B = Parameter(rowCount, init=fixedvalue, value=0.1);

        BFF_final = Plus(BFF1, BFF2);
        S = Sigmoid(BFF_final);
    }

     # Base feed forward with two inputs: one BFF layer per input, both with
     # rowCount rows, summed without a nonlinearity.
     BFF_multi(in1, in2, rowCount, colCount1, colCount2)
     {
        BFF1 = BFF(in1, rowCount, colCount1);
        BFF2 = BFF(in2, rowCount, colCount2);
        # NOTE(review): B is not referenced by any later line of this macro -
        # confirm whether it was meant to be added to the sum below.
        B = Parameter(rowCount, init=fixedvalue, value=0.1);

        BFF_final = Plus(BFF1, BFF2);
     }

    # LSTM layer with peephole connections, assembled from primitive nodes.
    #   inputDim  - column count of the input weights wx
    #   cellDim   - dimension of the cell state and of the layer output
    #   inputx    - input node
    #   cellDimX2 - row offset of the third slice of the stacked pre-activation
    #   cellDimX3 - row offset of the fourth slice of the stacked pre-activation
    #   cellDimX4 - row count of the stacked weights and bias
    #   initScale - initValueScale of the uniformly initialized parameters
    #   initBias  - fixed initial value of the bias
    # NDL does no arithmetic here, so the visible callers pass cellDim * 2,
    # cellDim * 3 and cellDim * 4 for the last three dimensions explicitly.
    LSTMComponent(inputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4, initScale, initBias)
    {
        # stacked input weights, bias and recurrent weights, cellDimX4 rows each
        wx = Parameter(cellDimX4, inputDim, init=uniform, initValueScale=initScale);
        b = Parameter(cellDimX4, init=fixedValue, value=initBias);
        Wh = Parameter(cellDimX4, cellDim, init=uniform, initValueScale=initScale);

        # peephole weights, applied to the cell state through DiagTimes
        Wci = Parameter(cellDim, init=uniform, initValueScale=initScale);
        Wcf = Parameter(cellDim, init=uniform, initValueScale=initScale);
        Wco = Parameter(cellDim, init=uniform, initValueScale=initScale);

        # output and cell state of the previous time step; both refer to nodes
        # defined further down, which closes the recurrent loop
        dh = PastValue(cellDim, output, timeStep=1);
        dc = PastValue(cellDim, ct, timeStep=1);

        # stacked pre-activation: wx * inputx + b + Wh * dh
        wxx = Times(wx, inputx);
        wxxpb = Plus(wxx, b);

        # Wh is referenced with the same casing it is declared with
        whh = Times(Wh, dh);

        wxxpbpwhh = Plus(wxxpb, whh);

        # cut the stacked pre-activation into four slices of cellDim rows each
        G1 = RowSlice(0, cellDim, wxxpbpwhh);
        G2 = RowSlice(cellDim, cellDim, wxxpbpwhh);
        G3 = RowSlice(cellDimX2, cellDim, wxxpbpwhh);
        G4 = RowSlice(cellDimX3, cellDim, wxxpbpwhh);

        # it = Sigmoid of G1 plus the peephole term on the previous cell state
        Wcidc = DiagTimes(Wci, dc);
        it = Sigmoid (Plus ( G1, Wcidc));

        # it scales Tanh of G2
        bit = ElementTimes(it, Tanh( G2 ));

        # ft = Sigmoid of G3 plus the peephole term on the previous cell state
        Wcfdc = DiagTimes(Wcf, dc);
        ft = Sigmoid( Plus (G3, Wcfdc));

        # ft scales the previous cell state
        bft = ElementTimes(ft, dc);

        # new cell state
        ct = Plus(bft, bit);

        # ot = Sigmoid of G4 plus the peephole term on the new cell state
        Wcoct = DiagTimes(Wco, ct);
        ot = Sigmoid( Plus( G4, Wcoct));

        # layer output: ot scales Tanh of the new cell state
        output = ElementTimes(ot, Tanh(ct));
    }

]

ndlCreateNetwork=[

    # Dimensions. Derived sizes are written out as literals and have to be kept
    # in sync by hand.
    inputDim=944
    # should be inputDim * 3
    featDim=2832

    lookupDim=50
    # should be lookupDim * 3 since we use lookupTableOrder of 3
    lookupedDim=150

    labelDim=127

    hiddenDim=300
    # the following three are hiddenDim * 2, * 3 and * 4; they are not needed in
    # BrainScript since they can be computed by the compiler
    hiddenDimX2=600
    hiddenDimX3=900
    hiddenDimX4=1200

    initScale=6
    initBias=-1.0

    features=Input(featDim, tag=feature)
    labels=Input(labelDim, tag=label)

    # define network
    Wlookup = Parameter(lookupDim, inputDim, init=Uniform, initValueScale=initScale);
    featLookuped = LookupTable(Wlookup, features, init=Uniform, initValueScale=initScale);

    # Exactly one of the two LSTM alternatives below may be active at a time.
    # Both define LSTMoutput, so the layers after them do not depend on the choice.

#if you want to use 3 LSTM layers uncomment following and comment out the next block

    #LSTM Layer 1
    LSTMoutput1 = LSTMComponent(lookupedDim, hiddenDim, featLookuped, hiddenDimX2, hiddenDimX3, hiddenDimX4, initScale, initBias);
    #LSTM Layer 2
    LSTMoutput2 = LSTMComponent(hiddenDim, hiddenDim, LSTMoutput1, hiddenDimX2, hiddenDimX3, hiddenDimX4, initScale, initBias);
    #LSTM Layer 3
    LSTMoutput = LSTMComponent(hiddenDim, hiddenDim, LSTMoutput2, hiddenDimX2, hiddenDimX3, hiddenDimX4, initScale, initBias);

#else if you want to use just 1 LSTM layer uncomment following and comment out the above block

#    #LSTM Layer 1
#    LSTMoutput = LSTMComponent(lookupedDim, hiddenDim, featLookuped, hiddenDimX2, hiddenDimX3, hiddenDimX4, initScale, initBias);

#end if

    W = Parameter(labelDim, hiddenDim, init=Uniform, initValueScale=initScale);

    # Exactly one of the two output alternatives below may be active at a time.
    # Both define outputs, which the criterion node reads.

#if you want to include a bias at last layer uncomment following and comment the next block

#    b = Parameter(labelDim, init=fixedvalue, value=0);
#    outputs = Plus(Times(W, LSTMoutput), b);

#else if you don't want to include a bias uncomment following and comment the above block

    outputs = Times(W, LSTMoutput);

#end if

    cr = CrossEntropyWithSoftmax(labels, outputs);

    # cr serves as both the training criterion and the evaluation metric
    CriterionNodes = (cr)
    EvalNodes = (cr)
    OutputNodes = (outputs)
]