/* dim_op.c : engine of operations along a dimension (e.g. running mean)
 */

#include<stdio.h>
#include<string.h>
#include "ruby.h"
#include "narray.h"
#include<math.h>

#ifndef RARRAY_PTR
#  define RARRAY_PTR(ary) (RARRAY(ary)->ptr)
#endif
#ifndef RARRAY_LEN
#  define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif

/* for compatibility for NArray and NArray with big memory patch */
#ifndef NARRAY_BIGMEM
typedef int na_shape_t;
#endif

enum bc_type {
    BC_SIMPLE=10, 
    BC_CYCLIC=11, 
    BC_TRIM=12
};

#define ID3(i,j,k) ((i) + (j)*n0 + (k)*n0*n1)
#define ID3T(i,j,k) ((i) + (j)*n0 + (k)*n0*(n1-wlen+1))
#define ID3o(i,j,k) ((i) + (j)*n0 + (k)*n0*n1o)
#define ID3z(i,j,k) ((i) + (j)*n0 + (k)*n0*nz)
#define ID3c(i,j,k) ((i) + (j)*n0 + (k)*n0*nc)
#define ID3e(i,j,k,ni,nj,nk) (((i)%(ni)) + (j)*(ni) + ((k)%(nk))*(ni)*(nj))
         // ID3e: covers 1D case (if ni==1 or nk==1, i or k is neglected by %)
#define ID2(i,j) ((i) + (j)*n0)
#define ID5(i,j,k,l,m) ((i) + (j)*ni + (k)*ni*nd1 + (l)*ni*nd1*no1 + (m)*ni*nd1*no1*nd2)
#define NMax(i,j) ( (i) >= (j) ? (i) : (j) )
#define NMin(i,j) ( (i) <= (j) ? (i) : (j) )

static int
convol_result_length(na_shape_t n1, na_shape_t wlen, int ibc)
{
    int n1o;
    switch (ibc) {
    case BC_SIMPLE:
    case BC_CYCLIC:
	n1o = n1;
	break;
    case BC_TRIM:
	n1o = n1-wlen+1;
	break;
    }
    return n1o;
}

static void 
running_mean_nomiss(zi,n0,n1,n2, w,wlen, ibc, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    double *w;
    na_shape_t wlen;
    int ibc;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t i,j,k,m, jj;
    na_shape_t wl2s, wl2;
    double wsum;
    double *wc;     // scaled w (sum wc == 1)
    double *wcsum1,*wcsum2;  // for BC_SIMPLE; for shorter summation at edges
    double wcm;     // modified wcm for shorter summation

    wl2s = (wlen - 1)/2;  // e.g. 7->3, 6->2
    wl2 = wlen/2;         // e.g. 7->3, 6->3;   wl2s + wl2 == wlen - 1

    // < scale w to make its sum == 1 >

    for (m=0,wsum=0.0; m<wlen; m++) {
	wsum += w[m];
    }
    if (wsum != 1.0) {
	wc = ALLOCA_N(double, wlen);
	for (m=0; m<wlen; m++) {
	    wc[m] = w[m] / wsum;
	}
    } else {
	wc = w;
    }

    // < weighted running mean >

    switch (ibc) {
    case BC_SIMPLE:
	wcsum1 = ALLOCA_N(double, wl2s);
	wcsum1[0] = 1.0 - wc[0];              // sum of wc[1...wlen]
	for (m=1; m<wl2s; m++) {
	    wcsum1[m] = wcsum1[m-1] - wc[m];   // sum of wc[m+1...wlen]
	}
	wcsum2 = ALLOCA_N(double, wl2);
	wcsum2[0] = 1.0 - wc[wlen-1];         // sum of wc[0...wlen-1]
	for (m=wlen-2; m>wlen-1-wl2; m--) {
	    wcsum2[wlen-1-m] = wcsum2[wlen-2-m] - wc[m];   // sum of wc[0...m]
	}       // 1..wl2-1 with m

	for (k=0; k<n2; k++) {
	    // where j is close to the "left" end
	    for (j=0; j<wl2s; j++) {
		for (m=-j+wl2s; m<wlen; m++) {
		    wcm = wc[m] / wcsum1[wl2s-1-j];
		    for (i=0; i<n0; i++) {
			zo[ID3(i,j,k)] += wcm*zi[ID3(i,j-wl2s+m,k)];
		    }                                //^^^^^^^^ from 0
		}
	    }
	    // where grids are enough on both sides
	    for (j=wl2s; j<n1-wl2; j++) {
		for (m=0; m<wlen; m++) {
		    for (i=0; i<n0; i++) {
			zo[ID3(i,j,k)] += wc[m]*zi[ID3(i,j-wl2s+m,k)];
		    }
		}
	    }
	    // where j is close to the "right" end
	    for (j=n1-wl2; j<n1; j++) {
		for (m=0; m<n1-j+wl2s; m++) {
		    wcm = wc[m] / wcsum2[j-n1+wl2];  // wcsum2: 0...wl2
		    for (i=0; i<n0; i++) {
			zo[ID3(i,j,k)] += wcm*zi[ID3(i,j-wl2s+m,k)];
		    }                                //^^^^^^^^ upto n1-1
		}
	    }
	}
	break;
    case BC_CYCLIC:
	for (k=0; k<n2; k++) {
	    for (j=0; j<n1; j++) {
		for (m=0; m<wlen; m++) {
		    jj = j-wl2s+m;
		    if ( jj < 0 ) {
			jj += n1;
		    } else {
			jj %= n1;
		    }
		    for (i=0; i<n0; i++) {
			zo[ID3(i,j,k)] += wc[m]*zi[ID3(i,jj,k)];
		    }
		}
	    }
	}
	break;
    case BC_TRIM:
	// trim where extra grids are not enough
	for (k=0; k<n2; k++) {
	    for (j=0; j<n1-wlen+1; j++) {
		for (m=0; m<wlen; m++) {
		    for (i=0; i<n0; i++) {
			zo[ID3T(i,j,k)] += wc[m]*zi[ID3(i,j+m,k)];
		    }
		}
	    }
	}
	break;
    default:
	rb_raise(rb_eArgError,"Undefined boundary condision (%d)", ibc);
	break;
    }
}


static void 
running_mean_miss(zi,n0,n1,n2, w,wlen, ibc, zmiss,nminvalid, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    double *w;
    na_shape_t wlen;
    int ibc;
    double zmiss;
    int nminvalid;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t i,j,k,m, jj;
    na_shape_t wl2s, wl2;
    double wsum;
    int nm;

    wl2s = (wlen - 1)/2;  // e.g. 7->3, 6->2
    wl2 = wlen/2;         // e.g. 7->3, 6->3;   wl2s + wl2 == wlen - 1

    // < weighted running mean >

    switch (ibc) {
    case BC_SIMPLE:
	for (k=0; k<n2; k++) {
	    // where j is close to the "left" end
	    for (j=0; j<wl2s; j++) {
		for (i=0; i<n0; i++) {
		    wsum = 0.0;
		    nm = 0;
		    for (m=-j+wl2s; m<wlen; m++) {
			if ( zi[ID3(i,j-wl2s+m,k)] != zmiss ) {
			    zo[ID3(i,j,k)] += w[m]*zi[ID3(i,j-wl2s+m,k)];
                                                          //^^^^^^^^ from 0
			    wsum += w[m];
			    nm++;
			}
		    }
		    if (nm >= nminvalid) {
			zo[ID3(i,j,k)] /= wsum;
		    } else {
			zo[ID3(i,j,k)] = zmiss;
		    }
		}
	    }
	    // where grids are enough on both sides
	    for (j=wl2s; j<n1-wl2; j++) {
		for (i=0; i<n0; i++) {
		    wsum = 0.0;
		    nm = 0;
		    for (m=0; m<wlen; m++) {
			if ( zi[ID3(i,j-wl2s+m,k)] != zmiss ) {
			    zo[ID3(i,j,k)] += w[m]*zi[ID3(i,j-wl2s+m,k)];
			    wsum += w[m];
			    nm++;
			}
		    }
		    if (nm >= nminvalid) {
			zo[ID3(i,j,k)] /= wsum;
		    } else {
			zo[ID3(i,j,k)] = zmiss;
		    }
		}
	    }
	    // where j is close to the "right" end
	    for (j=n1-wl2; j<n1; j++) {
		for (i=0; i<n0; i++) {
		    wsum = 0.0;
		    nm = 0;
		    for (m=0; m<n1-j+wl2s; m++) {
			if ( zi[ID3(i,j-wl2s+m,k)] != zmiss ) {
			    zo[ID3(i,j,k)] += w[m]*zi[ID3(i,j-wl2s+m,k)];
		                                         //^^^^^^^^ upto n1-1
			    wsum += w[m];
			    nm++;
			}
		    }
		    if (nm >= nminvalid) {
			zo[ID3(i,j,k)] /= wsum;
		    } else {
			zo[ID3(i,j,k)] = zmiss;
		    }
		}
	    }
	}
	break;
    case BC_CYCLIC:
	for (k=0; k<n2; k++) {
	    for (j=0; j<n1; j++) {
		for (i=0; i<n0; i++) {
		    wsum = 0.0;
		    nm = 0;
		    for (m=0; m<wlen; m++) {
			jj = j-wl2s+m;
			if ( jj < 0 ) {
			    jj += n1;
			} else {
			    jj %= n1;
			}
			if ( zi[ID3(i,jj,k)] != zmiss ) {
			    zo[ID3(i,j,k)] += w[m]*zi[ID3(i,jj,k)];
			    wsum += w[m];
			    nm++;
			}
		    }
		    if (nm >= nminvalid) {
			zo[ID3(i,j,k)] /= wsum;
		    } else {
			zo[ID3(i,j,k)] = zmiss;
		    }
		}
	    }
	}
	break;
    case BC_TRIM:
	// trim where extra grids are not enough
	for (k=0; k<n2; k++) {
	    for (j=0; j<n1-wlen+1; j++) {
		for (i=0; i<n0; i++) {
		    wsum = 0.0;
		    nm = 0;
		    for (m=0; m<wlen; m++) {
			if ( zi[ID3(i,j+m,k)] != zmiss ) {
			    zo[ID3T(i,j,k)] += w[m]*zi[ID3(i,j+m,k)];
			    wsum += w[m];
			    nm++;
			}
		    }
		    if (nm >= nminvalid) {
			zo[ID3T(i,j,k)] /= wsum;
		    } else {
			zo[ID3T(i,j,k)] = zmiss;
		    }
		}
	    }
	}
	break;
    default:
	rb_raise(rb_eArgError,"Undefined boundary condision (%d)", ibc);
	break;
    }
}

// 注：convolution も引数は同じなのでドライバーは兼ねられる
// (入り口はわけないとならない)が今のところやってない．
// 
static VALUE
running_mean(int argc, VALUE *argv, VALUE self)
{
    VALUE vi;    // mandatory 1st arg; input NArray
    VALUE dim;   // mandatory 2nd arg; Integer 
    VALUE wgt;   // mandatory 3rd arg; weight 1D NArray
    VALUE bc;    // mandatory 4th arg; boundary condition (Integer class const)
    VALUE missv=Qnil; // optional 5th arg; if present(Float) vi may have missing
    int nminvalid=1;     // optional 6th arg; miminum count of non-missing vales
    int with_miss;
    struct NARRAY *na;
    na_shape_t *shi, wlen;
    na_shape_t n0, n1, n2;
    double *zi, *w, zmiss;
    int d, i, rank, ibc;
    VALUE vo;
    na_shape_t *sho, n1o;
    double *zo;

    // < process arguments >

    if ( argc<4 || argc>6 ) { rb_raise(rb_eArgError,"Need 4 to 6 arguments"); }
    vi  = argv[0];
    dim = argv[1];
    wgt = argv[2];
    bc  = argv[3];
    with_miss = (argc > 4) && (!NIL_P(argv[4]));
    if ( with_miss ) { missv = argv[4]; }
    if ( argc == 6) {
	nminvalid = NUM2INT( argv[5] );
    }

    //   1st arg
    if (!IsNArray(vi)) { rb_raise(rb_eArgError,"1st arg must be a NArray"); }
    vi = na_cast_object(vi, NA_DFLOAT);
    rank = NA_RANK(vi);
    zi = NA_PTR_TYPE(vi, double *);
    GetNArray(vi, na);
    shi = na->shape;

    //   2nd arg
    d = NUM2INT( dim );

    //   3rd arg
    if (!IsNArray(wgt)) {rb_raise(rb_eArgError,"3rd arg must be a 1D NArray");}
    if (NA_RANK(wgt)!=1) {rb_raise(rb_eArgError,"3rd arg must be a 1D NArray");}
    wgt = na_cast_object(wgt, NA_DFLOAT);
    wlen = NA_TOTAL(wgt);
    w = NA_PTR_TYPE(wgt, double *);

    //   4th arg
    ibc = NUM2INT( bc );

    //   5th arg
    if ( with_miss ) { zmiss = NUM2DBL(missv); }

    //   6th arg
    if (nminvalid > wlen) {rb_raise(rb_eArgError,"nminvalid > filtering length");}

    // < shape as 3D >

    n1 = shi[d];   // length of the dim to filter
    if (wlen >= n1) {rb_raise(rb_eArgError,"filter len >= len of the dim");}

    n0 = n2 = 1;
    for (i=0; i<d; i++) {
	n0 *= shi[i];   // total lengths of dims before d
    }
    for (i=d+1; i<rank; i++) {
	n2 *= shi[i];   // total lengths of dims after d
    }

    // < initialize the NArray to ruturn >

    n1o = convol_result_length(n1, wlen, ibc);

    sho = ALLOCA_N(int, rank);
    for(i=0; i<rank; i++){
	sho[i] = ( i!=d ? shi[i] : n1o );
    }

    vo = na_make_object(NA_DFLOAT, rank, sho, cNArray);
    GetNArray(vo, na);
    na_clear_data(na);
    zo = NA_PTR_TYPE(vo, double *);

    // < do the job >
    if ( with_miss ) {
	running_mean_miss(zi,n0,n1,n2, w,wlen, ibc, zmiss, nminvalid, zo);
    } else {
	running_mean_nomiss(zi,n0,n1,n2, w,wlen, ibc, zo);
    }

    return vo;
}

/* Running tile (2D) mean for data with missing.
   (if no missing, you can just run the 1D running mean twice.)
 */
static void 
running_mean_2D_miss(zi, ni,nd1,no1,nd2,no2,
                     with_w1, w1, wlen1, ibc1, with_w2, w2, wlen2, ibc2,
                     zmiss, nminvalid, zo)
    // IN
    double *zi;
    na_shape_t ni,nd1,no1,nd2,no2;  /* treated as 5D array (inner => outer) */
    int with_w1, with_w2;
    double *w1, *w2;
    na_shape_t wlen1, wlen2;
    int ibc1, ibc2;
    double zmiss;
    int nminvalid;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t i,j1,k1,j2,k2,m1,m2, jj1, jj2;
    na_shape_t mstr1, mend1, mstr2, mend2;
    na_shape_t wlhs1, wlhs2;
    double wsum, wv;
    int nm;

    wlhs1 = (wlen1 - 1)/2;  // e.g. 7->3, 6->2
    wlhs2 = (wlen2 - 1)/2;  // e.g. 7->3, 6->2

    // < weighted running mean >

    if ( !( ibc1 == BC_SIMPLE || ibc1 == BC_CYCLIC ) ) {
	rb_raise(rb_eArgError,"Undefined boundary condision(1st D) (%d)", ibc1);
    }
    if ( !( ibc2 == BC_SIMPLE || ibc2 == BC_CYCLIC ) ) {
	rb_raise(rb_eArgError,"Undefined boundary condision(2nd D) (%d)", ibc2);
    }

    for (k2=0; k2<no2; k2++) {
        for (j2=0; j2<nd2; j2++) {
            for (k1=0; k1<no1; k1++) {
                for (j1=0; j1<nd1; j1++) {
                    for (i=0; i<ni; i++) {
                        wsum = 0.0;
                        nm = 0;
                        if (ibc1==BC_SIMPLE) {
                            mstr1 = NMax(0,wlhs1-j1);
                            mend1 = NMin(wlen1,nd1+wlhs1-j1);
                        } else {  /* BC_CYCLIC */
                            mstr1 = 0;
                            mend1 = wlen1;
                        }
                        if (ibc2==BC_SIMPLE) {
                            mstr2 = NMax(0,wlhs2-j2);
                            mend2 = NMin(wlen2,nd2+wlhs2-j2);
                        } else {  /* BC_CYCLIC */
                            mstr2 = 0;
                            mend2 = wlen2;
                        }
                        for (m2=mstr2; m2<mend2; m2++) {
                            for (m1=mstr1; m1<mend1; m1++) {
                                jj2 = j2-wlhs2+m2;
                                jj1 = j1-wlhs1+m1;
                                if (ibc1==BC_CYCLIC) {
                                    if ( jj1 < 0 ) {
                                        jj1 += nd1;
                                    } else {
                                        jj1 %= nd1;
                                    }
                                }
                                if (ibc2==BC_CYCLIC) {
                                    if ( jj2 < 0 ) {
                                        jj2 += nd2;
                                    } else {
                                        jj2 %= nd2;
                                    }
                                }
                                if ( zi[ID5(i,jj1,k1,jj2,k2)] != zmiss ) {
                                    wv = 1.0;
                                    if(with_w1){ wv *= w1[m1]; }
                                    if(with_w2){ wv *= w2[m2]; }
                                    zo[ID5(i,j1,k1,j2,k2)] +=
                                        wv*zi[ID5(i,jj1,k1,jj2,k2)];
                                    wsum += wv;
                                    nm++;
                                }
                            }
                        }
                        if (nm >= nminvalid) {
                            zo[ID5(i,j1,k1,j2,k2)] /= wsum;
                        } else {
                            zo[ID5(i,j1,k1,j2,k2)] = zmiss;
                        }
                    }
                }
            }
        }
    }
}

// driver of running_mean_2D_miss
// (no-missing version is not needed)
// 
static VALUE
running_mean_2D(obj, vi, dim1, len_or_wgt1, bc1, dim2, len_or_wgt2, bc2,
                missv, nminvalid)
    VALUE obj;
    VALUE vi;     // input NArray
    VALUE dim1;   // Integer 
    VALUE len_or_wgt1;   // weight 1D NArray or Integer (length of uniform wgt)
    VALUE bc1 ;   // boundary condition (Integer)
    VALUE dim2;   // Integer 
    VALUE len_or_wgt2;   // weight 1D NArray or Integer (length of uniform wgt)
    VALUE bc2 ;   // boundary condition (Integer)
    VALUE missv;  // missing value in vi
    VALUE nminvalid;  // miminum count of non-missing vales
{
    struct NARRAY *na;
    VALUE wgt;
    na_shape_t *shi, wlen1, wlen2;
    int with_w1, with_w2;
    na_shape_t ni,nd1,no1,nd2,no2;
    double *zi, *w1, *w2, zmiss;
    int d1, d2, i, rank, ibc1, ibc2;
    VALUE vo;
    na_shape_t *sho, n1o;
    int nminvalid_;  // miminum count of non-missing vales
    double *zo;

    // < process arguments >

    if (!IsNArray(vi)) { rb_raise(rb_eArgError,"1st arg must be a NArray"); }
    vi = na_cast_object(vi, NA_DFLOAT);
    rank = NA_RANK(vi);
    if (rank < 2) { rb_raise(rb_eArgError,"rank 1st arg must be >= 2"); }
    zi = NA_PTR_TYPE(vi, double *);
    GetNArray(vi, na);
    shi = na->shape;

    d1 = NUM2INT( dim1 );
    d2 = NUM2INT( dim2 );
    nminvalid_ = NUM2INT( nminvalid );

    if (d1 >= d2) { rb_raise(rb_eArgError,"d1 < d2 is required"); }
    if (d2 >= rank) { rb_raise(rb_eArgError,"d2 must be < rank"); }

    with_w1 = IsNArray(len_or_wgt1);
    if (with_w1)  {
        wgt = len_or_wgt1;
        if (NA_RANK(wgt)!=1){rb_raise(rb_eArgError,"wgt1 must be a 1D NArray");}
        wgt = na_cast_object(wgt, NA_DFLOAT);
        wlen1 = NA_TOTAL(wgt);
        w1 = NA_PTR_TYPE(wgt, double *);
    } else {
        wlen1 = NUM2INT( len_or_wgt1 );
        w1 == NULL; /* will not be used */
    }

    with_w2 = IsNArray(len_or_wgt2);
    if (with_w2)  {
        wgt = len_or_wgt2;
        if (NA_RANK(wgt)!=1){rb_raise(rb_eArgError,"wgt2 must be a 1D NArray");}
        wgt = na_cast_object(wgt, NA_DFLOAT);
        wlen2 = NA_TOTAL(wgt);
        w2 = NA_PTR_TYPE(wgt, double *);
    } else {
        wlen2 = NUM2INT( len_or_wgt2 );
        w2 == NULL; /* will not be used */
    }

    ibc1 = NUM2INT( bc1 );
    ibc2 = NUM2INT( bc2 );

    zmiss = NUM2DBL(missv);


    // < shape as 5D >

    nd1 = shi[d1];   // length of the dim to filter
    if (wlen1 >= nd1) {rb_raise(rb_eArgError,"filter len >= len of the dim1");}
    nd2 = shi[d2];   // length of the dim to filter
    if (wlen2 >= nd1) {rb_raise(rb_eArgError,"filter len >= len of the dim2");}

    ni = no1 = no2 = 1;
    for (i=0; i<d1; i++) {
	ni *= shi[i];   // total lengths of dims before d
    }
    for (i=d1+1; i<d2; i++) {
	no1 *= shi[i];   // total lengths of dims after d
    }
    for (i=d2+1; i<rank; i++) {
	no2 *= shi[i];   // total lengths of dims after d
    }

    // < initialize the NArray to ruturn >

    sho = shi; // Limited to shape conserved cases
    vo = na_make_object(NA_DFLOAT, rank, sho, cNArray);
    GetNArray(vo, na);
    na_clear_data(na);
    zo = NA_PTR_TYPE(vo, double *);

    // < do the job >
    running_mean_2D_miss(zi, ni,nd1,no1,nd2,no2,
                         with_w1, w1, wlen1, ibc1, with_w2, w2, wlen2, ibc2,
                         zmiss, nminvalid_, zo);

    return vo;
}

static void 
bin_mean_nomiss(zi,n0,n1,n2, len, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    na_shape_t len;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t n1o;
    na_shape_t i,j,k,m;
    double fact;

    n1o = n1 / len;
    fact = 1.0 / len;

    for (k=0; k<n2; k++) {
	for (j=0; j<n1o; j++) {
	    for (m=0; m<len; m++) {
		for (i=0; i<n0; i++) {
		    zo[ID3o(i,j,k)] += fact*zi[ID3(i,j*len+m,k)];
		}
	    }
	}
    }
}

static void 
bin_sum_nomiss(zi,n0,n1,n2, len, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    na_shape_t len;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t n1o;
    na_shape_t i,j,k,m;

    n1o = n1 / len;

    for (k=0; k<n2; k++) {
	for (j=0; j<n1o; j++) {
	    for (m=0; m<len; m++) {
		for (i=0; i<n0; i++) {
		    zo[ID3o(i,j,k)] += zi[ID3(i,j*len+m,k)];
		}
	    }
	}
    }
}

static void 
bin_mean_miss(zi,n0,n1,n2, len, zmiss,nminvalid, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    na_shape_t len;
    double zmiss;
    int nminvalid;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t n1o;
    na_shape_t i,j,k,m;
    na_shape_t cnt;

    n1o = n1 / len;

    for (k=0; k<n2; k++) {
	for (j=0; j<n1o; j++) {
	    for (i=0; i<n0; i++) {
		cnt = 0;
		for (m=0; m<len; m++) {
		    if ( zi[ID3(i,j*len+m,k)] != zmiss ) {
			zo[ID3o(i,j,k)] += zi[ID3(i,j*len+m,k)];
			cnt++;
		    }
		}
		if (cnt >= nminvalid) {
		    zo[ID3o(i,j,k)] /= cnt;
		} else {
		    zo[ID3o(i,j,k)] = zmiss;
		}
	    }
	}
    }
}

static void 
bin_sum_miss(zi,n0,n1,n2, len, zmiss,nminvalid, zo)
    // IN
    double *zi;
    na_shape_t n0;
    na_shape_t n1;
    na_shape_t n2;
    na_shape_t len;
    double zmiss;
    int nminvalid;
    // OUT
    double *zo;   //  must have been set to 0 everywhere
{
    na_shape_t n1o;
    na_shape_t i,j,k,m;
    na_shape_t cnt;

    n1o = n1 / len;

    for (k=0; k<n2; k++) {
	for (j=0; j<n1o; j++) {
	    for (i=0; i<n0; i++) {
		cnt = 0;
		for (m=0; m<len; m++) {
		    if ( zi[ID3(i,j*len+m,k)] != zmiss ) {
			zo[ID3o(i,j,k)] += zi[ID3(i,j*len+m,k)];
			cnt++;
		    }
		}
		if ( !(cnt >= nminvalid) ) {
		    zo[ID3o(i,j,k)] = zmiss;
		}
	    }
	}
    }
}

// bin_mean or bin_sum depeiding on the last arg
static VALUE
bin_mean_sum(int argc, VALUE *argv, VALUE self, int mean)
{
    VALUE vi;   // 1st arg; input NArray
    VALUE dim;  // 2nd arg
    na_shape_t len;   // 3rd arg
    VALUE missv=Qnil; // optional 4th arg; if present(Float) vi may have missing
    int nminvalid=1;  // optional 5th arg

    int with_miss;
    struct NARRAY *na;
    double *zi, zmiss;
    na_shape_t *shi;
    na_shape_t n0, n1, n2;
    int d, i, rank;
    VALUE vo;
    na_shape_t *sho, n1o;
    double *zo;

    // < process arguments >

    if ( argc<3 || argc>5 ) { rb_raise(rb_eArgError,"Need 2 or 3 arguments"); }
    vi  = argv[0];

    dim = argv[1];
    d = NUM2INT( dim );

    len  = NUM2INT( argv[2] );
    if (len<1) {rb_raise(rb_eArgError,"len must be >= 1");}
    
    with_miss = (argc > 3) && (!NIL_P(argv[3]));
    if ( with_miss ) { 
	missv = argv[3];
	zmiss = NUM2DBL(missv);
    }

    if (argc==5) { 
	nminvalid  = NUM2INT( argv[4] );
	if (nminvalid > len) {rb_raise(rb_eArgError,"nminvalid > bin length");}
    }


    // < NArray elements >

    vi = na_cast_object(vi, NA_DFLOAT);
    rank = NA_RANK(vi);
    zi = NA_PTR_TYPE(vi, double *);
    GetNArray(vi, na);
    shi = na->shape;

    // < shape as 3D >

    n1 = shi[d];   // length of the dim
    if (len >= n1) {rb_raise(rb_eArgError,"filter len >= len of the dim");}

    n0 = n2 = 1;
    for (i=0; i<d; i++) {
	n0 *= shi[i];   // total lengths of dims before d
    }
    for (i=d+1; i<rank; i++) {
	n2 *= shi[i];   // total lengths of dims after d
    }

    // < initialize the NArray to ruturn >

    n1o = n1 / len;   // <- BC_TRIM (currently this is the only available bc)

    sho = ALLOCA_N(int, rank);
    for(i=0; i<rank; i++){
	sho[i] = ( i!=d ? shi[i] : n1o );
    }

    vo = na_make_object(NA_DFLOAT, rank, sho, cNArray);
    GetNArray(vo, na);
    na_clear_data(na);
    zo = NA_PTR_TYPE(vo, double *);

    // < do the job >
    if ( mean )  {
	if ( with_miss ) {
	    bin_mean_miss(zi,n0,n1,n2, len, zmiss,nminvalid, zo);
	} else {
	    bin_mean_nomiss(zi,n0,n1,n2, len, zo);
	}
    } else {
	if ( with_miss ) {
	    bin_sum_miss(zi,n0,n1,n2, len, zmiss,nminvalid, zo);
	} else {
	    bin_sum_nomiss(zi,n0,n1,n2, len, zo);
	}
    }

    return vo;
}

static VALUE
bin_mean(int argc, VALUE *argv, VALUE self)
{
    return bin_mean_sum(argc, argv, self, 1);
}

static VALUE
bin_sum(int argc, VALUE *argv, VALUE self)
{
    return bin_mean_sum(argc, argv, self, 0);
}


/* 
   cum_sum_dfloat_bang : cumulative summation along a dimension zdim (FLOAT)
   (bang method: overwrite the input by the result)
 */
static VALUE
cum_sum_dfloat_bang(obj, f, zdim)
     VALUE obj;
     VALUE f;
     VALUE zdim;
{
    int rank, zd, d;
    struct NARRAY *na;
    na_shape_t *shape;
    double *v;
    na_shape_t n0, nz, n2;
    na_shape_t j, k, l;

    if ( NA_TYPE(f) != NA_DFLOAT ){
        rb_raise(rb_eArgError, "expects a DFLOAT NArray");
    }

    rank = NA_RANK(f);
    GetNArray(f, na);
    shape = na->shape;
    v = (double *)NA_PTR(na, 0);

    zd = NUM2INT(zdim);
    if (zd < 0) zd += rank;    // negative: count from the last dim

    if (zd < 0 || zd >= rank){
        rb_raise(rb_eArgError, 
	   "Invalid dimension (%d) for a rank %d NArray", NUM2INT(zdim), rank);
    }

    for (d=0, n0=1 ; d<zd ; d++){
	n0 *= shape[d];
    }
    nz = shape[zd];
    for (d=zd+1, n2=1 ; d<rank ; d++){
	n2 *= shape[d];
    }

    for (l=0; l<n2; l++){
	for (k=1; k<nz; k++){
	    for (j=0; j<n0; j++){
		v[ID3z(j,k,l)] += v[ID3z(j,k-1,l)];
	    }
	}
    }

    return Qnil;
}

/* 
   cum_sum : cumulative summation along a dimension zdim (FLOAT)

   This method may be good to extend NArray.
   At this moment (2014-11-30), though, it is made available to Ruby
   only as a private method of VArray.
 */
static VALUE
cum_sum(obj, f, zdim)
     VALUE obj;
     VALUE f;
     VALUE zdim;
{
    VALUE sum;

    sum = na_clone(f);
    switch( NA_TYPE(f) ) {
    case NA_DFLOAT:
	cum_sum_dfloat_bang(obj, sum, zdim);
	break;
    default:
        rb_raise(rb_eArgError, "Sorry, this type of NArray is yet to be supported");
	break;
    }
    return sum;
}


/*
cell_integ_irreg: trapezoidal numerical integration over coordinate cells, supporting irregular grid

== Description

Suppose a multi-dimensional NArray f[:,k,:], where colon represents
any number of dimensions, and k is the "z" dimension along which
integration is made. We write its real space representation as
f(z; x), where x symbolically represents all of the independent
variables other than z, and for simplicity, we further write it as
f(z). 

z is sampled at z_k, k=0,1,...,nzbound-1. This method allows z_k to be
defined for each z column, so it requires a multi-D NArray argument
z[:,k,:] (having the same shape as f). Optionally, nzbound can also
vary as nzbound[:,:]. If, instead, nil is given to nzbound, the entire
z grid is used; nzbound is set to f.shape(zdim).

We define the integration of f as

             {    \int_za^zb f(z) dz,  when za<=zb,
  I(za,zb) = {
             {   -\int_za^zb f(z) dz,  otherwise.

In other words, our integration is always made from the smaller end
to the greater end.

In the normal use case (when w is given nil), we define the cell
integration as,

  I(-\infty, zc_0), I(zc_0, zc_1), I(zc_1, zc_2),...,

The cell boundaries zc_m (m=0,1,..) are specified by the 1D NArray
argument "ccell"; ccell must be aligned in the increasing order.

This method allows coordinate transformation by specifying another
coordinate variable w[:,k,:] (having the same shape as f). In this 
case, the ccell argument specifies a coordinate with respect to w:
wc_m (m=0,1,...; wc_m must be in the increasing order). 
The integration is still taken with respect to z, so the cell
integration is expressed as

  I(-\infty, z(wc_0)), I(z(wc_0), z(wc_1)), I(z(wc_1), z(wc_2)),...,

The grid values z[:,k,:] and w[:,k,:] do not have to be monotonic;
the numerical integration properly treats the contribution from
multiple ranges along k. Mathematically, the coordinate-transferred
integration over the w bin (-\infty, wc] is expressed as

  \int_-\infty^+\infty H(wc-w(z)) f(z) dz,

where H is the Heaviside function. The normal use case (without w) 
is simply when w is z itself, which is exploited in implementation.

 */
static VALUE
cell_integ_irreg(obj, f, mask, z, zdim, nzbound, ccell, w)
     VALUE obj;
     VALUE f;     // [NArray] multi-D data to be integrated
     VALUE mask;  // [nil, or NArray] if NArray, mask of f
     VALUE z;     // [NArray] 1D or multi-D coordinate values of f's grid
                  // (if 1D, its length must be f's along zdim;
                  // if multi-D z.shape must be equal to f.shape).
                  // integration is always along z (whether or not w is given)
     VALUE zdim;  // [Integer] dimension of f along which to integrate
     VALUE nzbound; // [nil, or NArray(integer) with rank 1 smaller than f's]
                    // Length of the actual z dim for each column (data must
                    // be packed from the beginning of z dim; 0...nzbound[j,l]).
                    // If nil, the entire column is assumed valid;
     VALUE ccell; // [NArray] 1D grid to sample the result. 
                  // (if multi-D, its shape except along zdim must be f's);
                  // It is the z coordinate if w is nil;
                  // if w is given, it's a w grid.
     VALUE w;     // [nil or NArray] alternative grid point values to express
                  // the result as a function of w rather than z (special case)
                  // (e.g. z: pressure/g; w: potential temperature)
{
    struct NARRAY *na;
    int rank, zd, d, with_mask;
    na_shape_t *shape;
    na_shape_t n0, nz, n2, nzw;
    na_shape_t n0z, n2z, n0w, n2w, n0c, n2c;
    na_shape_t *oshape, nc;
    VALUE F;       // the result: \int f dz
    double *fv, *zv, *wc, *wv, *Fv;
    int32_t *nzbd;
    u_int8_t *msk;
    na_shape_t j, k, l, m;     // k: index of z (orig);  m: index of ccell
    double fa, fb, dz, wa, wb, wac, wbc, a, b, fi;

    // cast to ensure pointer types

    if(!IsNArray(f)) rb_raise(rb_eArgError, "f is not a NArray");
    if(mask!=Qnil && !IsNArray(mask)) rb_raise(rb_eArgError, "mask is must be nil or a NArray");
    if(!IsNArray(z)) rb_raise(rb_eArgError, "z is not a NArray");
    if(!IsNArray(ccell)) rb_raise(rb_eArgError, "ccell is not a NArray");
    if(w!=Qnil && !IsNArray(w)) rb_raise(rb_eArgError, "w is must be nil or a NArray");

    f = na_cast_object(f, NA_DFLOAT);
    z = na_cast_object(z, NA_DFLOAT);
    if(nzbound != Qnil){ nzbound = na_cast_object(nzbound, NA_LINT); }
    ccell = na_cast_object(ccell, NA_DFLOAT);
    if(w != Qnil){ w = na_cast_object(w, NA_DFLOAT); }

    // read & check the shapes

    rank = NA_RANK(f);
    if ( nzbound != Qnil && NA_RANK(nzbound) != rank-1 ){
        rb_raise(rb_eArgError, "rank of nzbound must be 1 smaller than f's");
    }

    zd = NUM2INT(zdim);
    if (zd < 0) zd += rank;    // negative: count from the last dim

    if (zd < 0 || zd >= rank){
        rb_raise(rb_eArgError, 
		"Invalid dimension (%d) since f.rank==%d", NUM2INT(zdim), rank);
    }

    GetNArray(f, na);
    shape = na->shape; 
    for (d=0, n0=1 ; d<zd ; d++){
	n0 *= shape[d];
    }
    nz = shape[zd];
    for (d=zd+1, n2=1 ; d<rank ; d++){
	n2 *= shape[d];
    }

    if(mask==Qnil) {
        with_mask = 0;
        msk = NULL;
    } else {
        with_mask = 1;
        if (NA_TOTAL(mask) != n0*nz*n2){
            rb_raise(rb_eArgError, "lengths of f and mask must agree");
        }
        GetNArray(mask, na);
        msk = (u_int8_t *)NA_PTR(na, 0);
    }

    if ( NA_RANK(z) == rank ){
        if (NA_TOTAL(z) != n0*nz*n2){
            rb_raise(rb_eArgError, "lengths of f and z must agree (if multiD)");
        }
        n0z = n0;
        n2z = n2;
    } else if (NA_RANK(z) == 1) {
        if (NA_TOTAL(z) != nz){
            rb_raise(rb_eArgError, "lengths of z must be nz (if 1D)");
        }
        n0z = 1;
        n2z = 1;
    } else {
        rb_raise(rb_eArgError, "z must have the same rank with f or be 1D");
    }

    if (nzbound != Qnil && NA_TOTAL(nzbound) != n0*n2){
	rb_raise(rb_eArgError, "shapes of f and nzbound are incompatible");
    }

    // prepare the output array; assign pointers

    if ( NA_RANK(ccell) == 1 ){
        nc = NA_TOTAL(ccell);
        n0c = 1;
        n2c = 1;
    } else if (NA_RANK(ccell) == rank) {
        GetNArray(ccell, na);
        shape = na->shape;
        for (d=0, n0c=1 ; d<zd ; d++){
            n0c *= shape[d];
        }
        nc = shape[zd];
        for (d=zd+1, n2c=1 ; d<rank ; d++){
            n2c *= shape[d];
        }
        if (n0c != n0) rb_raise(rb_eArgError,
                                "shape miss match btwn f and ccell (case 0)");
        if (n2c != n2) rb_raise(rb_eArgError,
                                "shape miss match btwn f and ccell (case 2)");
    } else {
        rb_raise(rb_eArgError, "ccell is must be 1D or f.rank");
    }
    oshape = ALLOCA_N(na_shape_t, rank);
    for (d=0; d<rank ; d++){
	if (d != zd){
	    oshape[d] = shape[d];
	} else {
	    oshape[d] = nc;
	}
    }
    F = na_make_object(NA_DFLOAT, rank, oshape, cNArray);
    GetNArray(F, na);
    na_clear_data(na);
    Fv = (double *)NA_PTR(na, 0);

    GetNArray(f, na);
    fv = (double *)NA_PTR(na, 0);

    GetNArray(z, na);
    zv = (double *)NA_PTR(na, 0);

    if(nzbound != Qnil){
	GetNArray(nzbound, na);
	nzbd = (int32_t *)NA_PTR(na, 0);
    }

    GetNArray(ccell, na);
    wc = (double *)NA_PTR(na, 0);
    if (wc[ID3e(0,0,0,n0c,nc,n2c)] > wc[ID3e(0,nc-1,0,n0c,nc,n2c)]){
	rb_raise(rb_eArgError, "ccell must be alined in the increasing order");
    }

    if(w != Qnil){ 
	GetNArray(w, na);
	wv = (double *)NA_PTR(na, 0);
        if ( NA_RANK(w) == rank ){
            if (NA_TOTAL(w) != n0*nz*n2){
                rb_raise(rb_eArgError, "lengths of f and w must agree (if multiD)");
            }
            n0w = n0;
            n2w = n2;
        } else if (NA_RANK(w) == 1) {
            if (NA_TOTAL(w) != nz){
                rb_raise(rb_eArgError, "lengths of z must be nz (if 1D)");
            }
            n0w = 1;
            n2w = 1;
        } else {
            rb_raise(rb_eArgError, "z must have the same rank with f or be 1D");
        }
    } else {
	wv = zv;
        n0w = n0z;
        n2w = n2z;
    }

    // main loop
    #pragma omp parallel for private(nzw,m,k,wa,wb,fa,fb,dz,wac,wbc,a,b,fi)
    for (l=0; l<n2; l++){
        #pragma omp parallel for private(nzw,m,k,wa,wb,fa,fb,dz,wac,wbc,a,b,fi)
	for (j=0; j<n0; j++){
	    if (nzbound == Qnil) {
		nzw = nz;
	    } else {
		nzw = nzbd[j+n0*l];
	        if(nzw>nz) nzw=nz; 
	    }
	    m=0;   // m: index of the new cooridnate
	    for (k=0; k<nzw-1; k++){
		// set the trapezoid
		if (wv[ID3e(j,k,l,n0w,nz,n2w)] < wv[ID3e(j,k+1,l,n0w,nz,n2w)]) {
		    wa = wv[ID3e(j,k,l,n0w,nz,n2w)];   // left/lower end along w
		    wb = wv[ID3e(j,k+1,l,n0w,nz,n2w)]; //right/upper end along w
		    fa = fv[ID3z(j,k,l)];    // at the left end
		    fb = fv[ID3z(j,k+1,l)];  // at the right end
		} else {
		    wa = wv[ID3e(j,k+1,l,n0w,nz,n2w)]; // left/lower end along w
		    wb = wv[ID3e(j,k,l,n0w,nz,n2w)];   //right/upper end along w
		    fa = fv[ID3z(j,k+1,l)];  // at the left end
		    fb = fv[ID3z(j,k,l)];    // at the right end
		}
                if ( with_mask && (!msk[ID3z(j,k,l)] || !msk[ID3z(j,k+1,l)]) ) {
                    fa = fb = 0.0;
                }
		dz = fabs(zv[ID3e(j,k+1,l,n0z,nz,n2z)]-zv[ID3e(j,k,l,n0z,nz,n2z)]);

		// find the right next grid point to wa (left end)
		if (wa < wc[ID3e(j,m,l,n0c,nc,n2c)]){
		    while( m>0 && wa < wc[ID3e(j,m-1,l,n0c,nc,n2c)] ){ m--; }
		} else {
		    while( wa >= wc[ID3e(j,m,l,n0c,nc,n2c)] && m<nc ){ m++; }
		    // m can be nc, meaning the entire trapezoid is outside 
		}

		// integration
		if (m<nc){
		    wac = wa;   // left end of the current bin
		    while(1){
			//wbc = (wb<=wc[m]) ? wb : wc[m];  // current right end
			wbc = fmin(wb, wc[ID3e(j,m,l,n0c,nc,n2c)]);

			// do the integration
			if (wb != wa){
			    a = (wac-wa)/(wb-wa);  // normalized (0-1) left end
			    b = (wbc-wa)/(wb-wa);  // normalized (0-1) right end
			    fi = (fa + (a+b)/2*(fb-fa)) * (b-a)*dz;
			} else {
			    fi = 0.0;
			}
			  // ^ f value at (a+b)/2     ^ width
			Fv[ID3c(j,m,l)] += fi;
			if ( wb <= wc[ID3e(j,m,l,n0c,nc,n2c)] || m == nc-1 ) break;
			wac = wc[ID3e(j,m,l,n0c,nc,n2c)];
			m++;
		    }
		} else {
		    // no need integrate
		    m = nc - 1;    // put m inside the range before next k
		}
	    }
	}
    }

    // finish
    return F;
}

/* 
   cum_integ_irreg : similar to cell_integ_irreg but it sums up along the
   axis. -- This method acutually uses cell_integ_irreg and make sumation.
 */
static VALUE
cum_integ_irreg(obj, f, mask, z, zdim, nzbound, ccell, w)
     VALUE obj;
     VALUE mask;  // [nil, or NArray] if NArray, mask of f
     VALUE f;
     VALUE z;
     VALUE zdim;
     VALUE nzbound;
     VALUE ccell;
     VALUE w;
{
    VALUE F;

    F = cell_integ_irreg(obj, f, mask, z, zdim, nzbound, ccell, w);
    cum_sum_dfloat_bang(obj, F, zdim);
    return F;
}

/* 
   cap_by_boundary : Cap (insert) a NArray with boundary values

   Restriction; data alignment is restricted so that the beginning of
   the out data is always valid (within the domain). To ensure it, it
   should be either zcrd is increasing and upper==true or zcrd is
   decreasing and upper==false.

   RETURN VALUES

   * fe: f capped by the boundary values. The dimension zdim is 
     extended by 1; i.e., f[:,nz,:] --> fe[:,nz+1,:], where ":" respresent
     arbitrary number of dimensions. The elements of fe are equal to 
     those of f where inside the domain (simple copies), and they are equal
     to the elements of fs at the bondary (simple copies if fs is given;
     if not, guessed by interpolation or naive extension).
   * ze: grid points of fe along zdim. It is a mixture of zcrd and zs;
     it is zcrd inside the domain (where f is copied), and it is zs
     at the boundary (where fs is copied).
     Same shape as fe. 
   * nze: The number of valid data along zdim of fe. Shaped as ze[:,:],
     according to the notation above. For example, when fe is 4D and
     zdim==2, fe[i,j,k,l] is valid for k = 0,1,...,nze[i,j,l]-1,
     where the boundary is at nze[i,j,l]-1. Thus, nze is always
     smaller than or equal to the length of zdim of fe (which is nz+1)

 */
static VALUE
cap_by_boundary(obj, f, zdim, zcrd, upper, zb, fb, misval)
     VALUE obj;
     VALUE f;     // [NArray] multi-D data
     VALUE zdim;  // [Integer] dimension of zcrd in f
     VALUE zcrd;  // [NArray] 1D coordinate values of the zdim dimension of f
     VALUE upper; // true/false to cap the upper/lower side (with z)
     VALUE zb;    // [NArray] the "surface" z; zb.rank must be f.rank-1
     VALUE fb;    // [nil or NArray] the f value at surface (zb.shape==fb.shape)
     VALUE misval;    // [nil or Float] if Float, missing value of f
{
    VALUE fe;  // [NArray] return value, extended f by fb
    VALUE ze;  // [NArray] return value, grid points of fe along zdim
    VALUE nze;  // [NArray] return value, valid data lengths along zdim in fe
    VALUE result; // [Array] [fe, ze, nze] (return the two in an Array)
    struct NARRAY *na;
    int rank, zd, d, w_mis, mis;
    na_shape_t n0, nz, n2, nc;
    double *fv, *zcv, *zbv, *fbv, *fev, *zev;
    double ze_filv, rmis;
    int32_t *nzev;
    int zcincr, capupper, sgn;
    na_shape_t j, k, l;
    na_shape_t *shape, *oshape, *oshape2;

    // cast to ensure pointer types

    if(!IsNArray(f)) rb_raise(rb_eArgError, "f is not a NArray");
    if(!IsNArray(zcrd)) rb_raise(rb_eArgError, "zcrd is not a NArray");
    if(!IsNArray(zb)) rb_raise(rb_eArgError, "zb is not a NArray");
    if(fb!=Qnil && !IsNArray(fb)) rb_raise(rb_eArgError, "fb must be nil or a NArray");

    f = na_cast_object(f, NA_DFLOAT);
    zcrd = na_cast_object(zcrd, NA_DFLOAT);
    zb = na_cast_object(zb, NA_DFLOAT);
    if(fb != Qnil){ fb = na_cast_object(fb, NA_DFLOAT); }
    w_mis = (misval != Qnil);
    if (w_mis) {
        rmis = NUM2DBL(misval);
    }

    // read & check

    rank = NA_RANK(f);
    if (NA_RANK(zb)!=rank-1){rb_raise(rb_eArgError,"zb.rank must be f.rank-1");}

    zd = NUM2INT(zdim);
    if (zd < 0) zd += rank;    // negative: count from the last dim
    if (zd < 0 || zd >= rank){
        rb_raise(rb_eArgError, 
		"Invalid dimension (%d) since f.rank==%d", NUM2INT(zdim), rank);
    }

    GetNArray(f, na);
    shape = na->shape; 
    for (d=0, n0=1 ; d<zd ; d++){
	n0 *= shape[d];
    }
    nz = shape[zd];
    for (d=zd+1, n2=1 ; d<rank ; d++){
	n2 *= shape[d];
    }
    fv = (double *)NA_PTR(na, 0);

    if (NA_TOTAL(zcrd) != nz){
	rb_raise(rb_eArgError,"zcrd.length (%d) != nz (%d)", NA_TOTAL(zcrd),nz);
    }
    zcv = NA_PTR_TYPE(zcrd, double *);

    if (NA_TOTAL(zb) != n0*n2){
	rb_raise(rb_eArgError,"shapes of f and zb are incompatible");
    }
    zbv = NA_PTR_TYPE(zb, double *);

    if(fb != Qnil){ 
	if (NA_TOTAL(fb) != n0*n2){
	    rb_raise(rb_eArgError,"shapes of f and fb are incompatible");
	}
	fbv = NA_PTR_TYPE(fb, double *);
    }

    // find the direction

    zcincr = zcv[0] < zcv[nz-1];            // zcrd is in the increasing order
    capupper = upper != Qnil && upper != Qfalse;  // whether upper is "true"
    sgn = capupper ? 1 : -1;
    if ( (zcincr && !capupper) || (!zcincr && capupper) ){
	rb_raise(rb_eArgError,"Unexpected data alignment: To ensure that the zdim dimension of the output NArray (fe, ze) starts from the valid data (i.e., its beginning is inside the domain), it should be either that zcrd is increasing and upper==true or that zcrd is decreasing and upper==false");
    }

    if (capupper) {
        ze_filv = 1e200;
    } else {
        ze_filv = -1e200;
    }
    // prepare the output array

    nc = nz+1;
    oshape = ALLOCA_N(na_shape_t, rank);
    for (d=0; d<rank ; d++){
	if (d != zd){
	    oshape[d] = shape[d];
	} else {
	    oshape[d] = nc;
	}
    }
    fe = na_make_object(NA_DFLOAT, rank, oshape, cNArray);
    fev = NA_PTR_TYPE(fe, double *);
    ze = na_make_object(NA_DFLOAT, rank, oshape, cNArray);
    zev = NA_PTR_TYPE(ze, double *);

    oshape2 = ALLOCA_N(na_shape_t, rank-1);
    for (d=0; d<rank ; d++){
	if (d < zd){
	    oshape2[d] = shape[d];
	} else if (d > zd) {
	    oshape2[d-1] = shape[d];
	}
    }

    nze = na_make_object(NA_LINT, rank-1, oshape2, cNArray);
    nzev = NA_PTR_TYPE(nze, int32_t *);

    // initialize the output data

    for (l=0; l<n2; l++){
	for (k=0; k<nz; k++){
	    for (j=0; j<n0; j++){
		fev[ID3c(j,k,l)] = fv[ID3z(j,k,l)];  // copy
		zev[ID3c(j,k,l)] = zcv[k];           // copy
	    }
	}
	for (j=0; j<n0; j++){
	    fev[ID3c(j,nc-1,l)] = 0e0;   // clear
	    zev[ID3c(j,nc-1,l)] = ze_filv;      // clear
	}
    }

    // main loop
    if(fb != Qnil){ 
	for (l=0; l<n2; l++){
	    for (j=0; j<n0; j++){
		for (k=0; k<nz; k++){
                    mis = (w_mis && fv[ID3z(j,k,l)] == rmis);
		    if ( mis || (zcv[k] - zbv[ID2(j,l)])*sgn > 0 ){
			fev[ID3c(j,k,l)] = fbv[ID2(j,l)];
			zev[ID3c(j,k,l)] = zbv[ID2(j,l)];
			nzev[ID2(j,l)] = k+1;
			break;
		    }
		}
		if (k==nz) {
		    // didn't break
		    fev[ID3c(j,k,l)] = fbv[ID2(j,l)];
		    zev[ID3c(j,k,l)] = zbv[ID2(j,l)];
		    nzev[ID2(j,l)] = k+1;
		}
	    }
	}
    } else {
	for (l=0; l<n2; l++){
	    for (j=0; j<n0; j++){
		for (k=0; k<nz; k++){
                    mis = (w_mis && fv[ID3z(j,k,l)] == rmis);
		    if ( mis || (zcv[k] - zbv[ID2(j,l)])*sgn > 0 ){
			fev[ID3c(j,k,l)] = fv[ID3z(j,k-1,l)];//naive extension
			//fev[ID3c(j,k,l)] = 
			//    ( fv[ID3z(j,k-1,l)]*(zcv[k]-zbv[ID2(j,l)])
			//    + fv[ID3z(j,k,l)]*(zbv[ID2(j,l)]-zcv[k-1]) ) /
			//    (zcv[k] - zcv[k-1]);
			zev[ID3c(j,k,l)] = zbv[ID2(j,l)];
			nzev[ID2(j,l)] = k+1;
			break;
		    }
		}
		if (k==nz) {
		    // didn't break
		    fev[ID3c(j,k,l)] = fv[ID3z(j,k-1,l)];  // naive extension
		    zev[ID3c(j,k,l)] = zbv[ID2(j,l)];
		    nzev[ID2(j,l)] = k+1;
		}
	    }
	}
    }

    // output
    result = rb_ary_new3(3, fe, ze, nze);
    return result;
}

/* 
   c_val_crossing (C extension): find the positions or the values of y where the values of z crosses zval along dim (using the linear interpolation)

   ARGUMENT
   * z [NArray of float or sfloat]
   * zmask [nil or NArray byte]
   * dim [Integer]
   * zval [Numeric]
   * y [nil or NArray of float or sfloat] if nil, dim indices are used
   * nth [Integer] 1,2,3,... for the 1st, 2nd, 3rd,... crossing
   * descend [true, false, or nil] true: inspect dim indices from the last
   * incr [nil, true, or false] nil: any crsossing is detected; true:
     only increasing crossing is detected; false: only decreasing crossing is ..

   RETURN VALUE
   * yc [NArray double or float] : value of y at the crossing
   * ycmask [NArray byte]: mask Array to create NArrayMiss from yc & ycmask
 */
static VALUE
val_crossing(obj, z, zmask, dim, zval, y, nth, descend, incr)
     VALUE obj;
     VALUE z;
     VALUE zmask;
     VALUE dim;
     VALUE zval;
     VALUE y;
     VALUE nth;
     VALUE descend;
     VALUE incr;
{
    VALUE yc;      // return value 1 (NArray)
    VALUE ycmask;  // return value 2 (NArray byte)
    double *yc_d;
    float *yc_f;
    u_int8_t *ycmsk;
    int rank, dm, d, y1d;
    struct NARRAY *na;
    na_shape_t *shape, *oshape;
    double *z_d, *y_d, zcv, z1, z2;
    float *z_f, *y_f;
    int zmis;
    u_int8_t *zmsk;
    na_shape_t n0, n1, n2;
    na_shape_t j, k, l;
    double a; // fractional position: 0<=a<=1
    na_shape_t kf, ke, kstp;
    int zdbl, ydbl;
    int ith, nthv;
    int icross, dcross;
    double fillv = 9.9692099683868690e+36; // from NetCDF

    if (!IsNArray(z)) {rb_raise(rb_eArgError, "z must be a NArray");}
    if (y!= Qnil && !IsNArray(y)) {
        rb_raise(rb_eArgError, "y must be nill or a NArray");
    }

    rank = NA_RANK(z);
    GetNArray(z, na);
    shape = na->shape;

    if ( NA_TYPE(z) == NA_DFLOAT) {
        zdbl = 1; // true
        z_d = (double *)NA_PTR(na, 0);
    } else if (NA_TYPE(z) == NA_SFLOAT) {
        zdbl = 0; // false
        z_f = (float *)NA_PTR(na, 0);
    } else {
        rb_raise(rb_eArgError, "z must be a DFLOAT or SFLOAT NArray");
    }

    if (zmask == Qnil) {
        zmis = 0; // no need to consider data missing;
    } else {
        if (!IsNArray(zmask)) {rb_raise(rb_eArgError, "zmask is not a NArray");}
        zmis = 1; // data missing may exist
        GetNArray(zmask, na);
        zmsk = (u_int8_t *)NA_PTR(na, 0);
    }
    

    dm = NUM2INT(dim);
    if (dm < 0) dm += rank;    // negative: count from the last dim

    if (dm < 0 || dm >= rank){
        rb_raise(rb_eArgError, 
	   "Invalid dimension (%d) for a rank %d NArray", NUM2INT(dim), rank);
    }

    zcv = NUM2DBL(zval);

    oshape = ALLOCA_N(na_shape_t, rank-1);
    for (d=0, n0=1 ; d<dm ; d++){
	n0 *= shape[d];
        oshape[d] = shape[d];
    }
    n1 = shape[dm];
    for (d=dm+1, n2=1 ; d<rank ; d++){
	n2 *= shape[d];
        oshape[d-1] = shape[d];
    }

    if ( descend == Qtrue ){
        kf = n1-1;
        ke = -1;
        kstp = -1;
    } else {
        kf = 0;
        ke = n1;
        kstp = 1;
    }
    
    if ( incr == Qnil ){
        icross = 1;
        dcross = 1;
    } else if ( incr == Qtrue ){
        icross = 1;
        dcross = 0;
    } else {
        icross = 0;
        dcross = 1;
    }

    nthv = NUM2INT(nth);
    if (nthv <= 0) {
        rb_raise(rb_eArgError, "nth (%d) must be positive (1,2,...)", nthv);
    }

    if ( y == Qnil ) {
        ydbl = 1; // true
        y1d = 1; // true
        y_d = ALLOCA_N(double, n1);
        for (k=0; k<n1; k++) {y_d[k] = (double) k;}
    } else {
        GetNArray(y, na);
        if ( NA_TYPE(y) == NA_DFLOAT) {
            ydbl = 1; // true
            y_d = (double *)NA_PTR(na, 0);
        } else if (NA_TYPE(y) == NA_SFLOAT) {
            ydbl = 0; // false
            y_f = (float *)NA_PTR(na, 0);
        } else {
            rb_raise(rb_eArgError, "expects a DFLOAT or SFLOAT NArray");
        }
        if (NA_TOTAL(y) == n1) {
            // treated as 1D along dim
            y1d = 1; // true
        } else if (NA_TOTAL(y) == NA_TOTAL(z)) {
            // treated as y and z share a shape
            y1d = 0; // false
        } else {
            rb_raise(rb_eArgError, "Shape (length) of y mismatches that of z");
        }
    }

    if (ydbl) {
        yc = na_make_object(NA_DFLOAT, rank-1, oshape, cNArray);
        yc_d = NA_PTR_TYPE(yc, double *);
        for (j=0; j<n0*n2; j++){yc_d[j] = fillv;}
    } else {
        yc = na_make_object(NA_SFLOAT, rank-1, oshape, cNArray);
        yc_f = NA_PTR_TYPE(yc, float *);
        for (j=0; j<n0*n2; j++){yc_f[j] = (float) fillv;}
    }
    ycmask = na_make_object(NA_BYTE, rank-1, oshape, cNArray);
    ycmsk = NA_PTR_TYPE(ycmask, u_int8_t *);
    for (j=0; j<n0*n2; j++){ycmsk[j] = 0;}

    for (l=0; l<n2; l++){
        for (j=0; j<n0; j++){
            z1 = ( zdbl ? z_d[ID3(j,kf,l)] : z_f[ID3(j,kf,l)] );
            ith = 0;
            for (k=kf+kstp; k!=ke; k+=kstp) {
                z2 = ( zdbl ? z_d[ID3(j,k,l)] : z_f[ID3(j,k,l)] );
                if ( ( !zmis || (zmsk[ID3(j,k-kstp,l)] && zmsk[ID3(j,k,l)]) ) &&
                     ( icross && z1<zcv && zcv<=z2 ||
                       dcross && z1>zcv && zcv>=z2 ) ){
                    ith++;
                    if (ith == nthv){
                        a = (zcv-z1) / (z2-z1);
                        if (y1d) {
                            if (ydbl) {
                                yc_d[ID2(j,l)] = (1.0-a)*y_d[k-kstp] + a*y_d[k];
                            } else {
                                yc_f[ID2(j,l)] = (1.0-a)*y_f[k-kstp] + a*y_f[k];
                            }
                        } else {
                            if (ydbl) {
                                yc_d[ID2(j,l)] = (1.0-a)*y_d[ID3(j,k-kstp,l)]
                                                     + a*y_d[ID3(j,k,l)];
                            } else {
                                yc_f[ID2(j,l)] = (1.0-a)*y_f[ID3(j,k-kstp,l)]
                                                     + a*y_f[ID3(j,k,l)];
                            }
                        }
                        ycmsk[ID2(j,l)] = 1;
                        break;
                    }
                }
                z1 = z2;
            }
        }
    }
    return rb_ary_new3(2, yc, ycmask);
}

void
init_gphys_dim_op()
{
    static VALUE mNumRu;
    static VALUE cGPhys;
    static VALUE cVArray;

    // rb_require("narray");  // it does not work ??
    mNumRu = rb_define_module("NumRu");

    cGPhys = rb_define_class_under(mNumRu, "GPhys", rb_cObject);
    rb_define_private_method(cGPhys, "c_running_mean", running_mean, -1);
    rb_define_private_method(cGPhys, "c_running_mean_2D", running_mean_2D, 9);
    rb_define_singleton_method(cGPhys, "c_running_mean", running_mean, -1);
    rb_define_singleton_method(cGPhys, "c_running_mean_2D", running_mean_2D, 9);

    cVArray = rb_define_class_under(mNumRu, "VArray", rb_cObject);
    rb_define_private_method(cVArray, "c_bin_mean", bin_mean, -1);
    rb_define_private_method(cVArray, "c_bin_sum", bin_sum, -1);
    rb_define_singleton_method(cGPhys, "c_bin_mean", bin_mean, -1);
    rb_define_singleton_method(cGPhys, "c_bin_sum", bin_sum, -1);

    rb_define_private_method(cVArray, "c_cum_sum", cum_sum, 2);
    rb_define_singleton_method(cGPhys, "c_cum_sum", cum_sum, 2);
    rb_define_singleton_method(cGPhys, "c_cell_integ_irreg", 
			       cell_integ_irreg, 7);
    rb_define_singleton_method(cGPhys, "c_cum_integ_irreg", cum_integ_irreg, 7);

    rb_define_singleton_method(cGPhys, "c_cap_by_boundary", cap_by_boundary, 7);
    rb_define_singleton_method(cGPhys, "c_val_crossing", val_crossing, 8);
}
