From 2aa5b81ea4dfed498afb99c638faad4658ab9556 Mon Sep 17 00:00:00 2001
From: Runhai Ouyang <rouyang@shu.edu.cn>
Date: Sat, 24 Aug 2024 12:19:21 +0800
Subject: [PATCH] Add files via upload

---
 src/DI.f90         | 1132 ++++++++++++++++++++++++++
 src/FC.f90         | 1680 +++++++++++++++++++++++++++++++++++++++
 src/FCse.f90       | 1894 ++++++++++++++++++++++++++++++++++++++++++++
 src/SISSO.f90      |  578 ++++++++++++++
 src/libsisso.f90   | 1842 ++++++++++++++++++++++++++++++++++++++++++
 src/var_global.f90 |   46 ++
 6 files changed, 7172 insertions(+)
 create mode 100644 src/DI.f90
 create mode 100644 src/FC.f90
 create mode 100644 src/FCse.f90
 create mode 100644 src/SISSO.f90
 create mode 100644 src/libsisso.f90
 create mode 100644 src/var_global.f90

diff --git a/src/DI.f90 b/src/DI.f90
new file mode 100644
index 0000000..d78f852
--- /dev/null
+++ b/src/DI.f90
@@ -0,0 +1,1132 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+module DI
+! descriptor identification
+!-------------------------------------------------------------
+use var_global
+use libsisso
+implicit none
+
+contains
+
+subroutine descriptor_identification
+real*8,allocatable:: xinput(:,:,:),yinput(:,:),beta(:,:),beta_init(:,:),coeff(:,:),xprime(:,:,:),&
+        xdprime(:,:,:),xmean(:,:),norm_xprime(:,:),yprime(:,:),prod_xty(:,:),prod_xtx(:,:,:),lassormse(:),&
+        ymean(:),intercept(:),weight(:,:)
+real*8  lambda,lambda_max,alpha,rmse_ave
+integer run_iter,i,j,k,l,ll,nf,maxns,ntry,nactive,mpii,mpij,mpin
+integer,allocatable:: idf(:),activeset(:),ncol(:)
+character line*500
+character,allocatable:: expr(:)*200
+logical isnew,dat_readerr
+
+if(mpirank==0) mytime.sDI=mpi_wtime()
+   maxns=maxval(nsample)
+
+!--------------------------
+allocate(xinput(maxns,nf_DI,ntask))
+allocate(yinput(maxns,ntask))
+allocate(expr(nf_DI))
+allocate(weight(maxns,ntask))
+allocate(activeset(nf_L0))
+
+!---------------------------------
+! job allocation to the CPU cores
+!---------------------------------
+allocate(ncol(mpisize))
+mpii=nf_DI/mpisize
+mpij=mod(nf_DI,mpisize)
+ncol(1)=mpii
+do mpin=1,mpisize-1
+if(mpin<=mpij) then
+ncol(1+mpin)=mpii+1
+else
+ncol(1+mpin)=mpii
+end if
+end do
+
+!-----------------------
+! data read in
+!-----------------------
+
+dat_readerr=.true.
+
+yinput=0
+xinput=0
+do i=1,ntask
+  if(ntask>1) then
+     write(line,'(a,i3.3,a)') 'Uspace_t',i,'.dat'
+  else 
+     write(line,'(a)') 'Uspace.dat'
+  end if
+  if(mpirank==0) then
+  open(fileunit,file='SIS_subspaces/'//trim(adjustl(line)),status='old')
+     if(ptype==1) then
+        do j=1,nsample(i)
+           read(fileunit,*,err=334) yinput(j,i),xinput(j,:,i)
+        end do
+     else
+        do j=1,nsample(i)
+           read(fileunit,*,err=334) xinput(j,:,i)
+           yinput(j,i)=0.d0  
+        end do
+     end if
+   close(fileunit)
+   end if
+   call mpi_bcast(yinput,maxns*ntask,mpi_double_precision,0,mpi_comm_world,mpierr)
+   call mpi_bcast(xinput,maxns*nf_DI*ntask,mpi_double_precision,0,mpi_comm_world,mpierr)
+end do
+
+dat_readerr=.false.
+334 if(dat_readerr) then
+       print *, 'Error while reading the file '//trim(adjustl(line)); stop
+    end if
+
+if(mpirank==0) then
+  open(fileunit,file='SIS_subspaces/Uspace.expressions',status='old')
+  
+  do i=1,nf_DI
+  read(fileunit,'(a)') line
+  line=trim(adjustl(line))
+  expr(i)=line(1:index(line,' '))
+  end do
+  close(fileunit)
+end if
+
+call mpi_bcast(expr,nf_DI*200,mpi_character,0,mpi_comm_world,mpierr)
+
+! argmin{sum(w_i*(y_i-bx_i)^2},e.g. https://en.wikipedia.org/wiki/Weighted_least_squares
+! one w_i corresponds to one (y_i-bx_i)
+weight=1.0
+
+! weighted lasso
+if(L1para.weighted) then
+open(fileunit,file='lasso.weight',status='old')
+read(fileunit,*) ! title line
+do i=1,ntask
+    do j=1,nsample(i)
+      read(fileunit,*) weight(j,i)
+    end do
+end do
+close(fileunit)
+end if
+
+
+if(nf_L0>nf_DI) then
+  print *, "Error: nf_L0 is greater than nf_DI ! "; stop
+end if
+
+!-----------------------------------------------------
+! MODEL SELECTION (L0 or L1L0)
+!-----------------------------------------------------
+if(trim(adjustl(method_so))=='L1L0' .and. nf_DI > nf_L0 .and. ptype==1) then
+    !----------------
+    ! L1 of the L1L0
+    ! see J. Friedman, T. Hastie, and R. Tibshirani 2010,"Regularization Paths for Generalized Linear Models via CD"
+    ! "J. Friedman, T. Hastie, and R. Tibshirani, 2010 "A note on the group lasso and a sparse group lasso"
+    !----------------
+    allocate(xprime(maxns,nf_DI,ntask))
+    allocate(xdprime(maxns,nf_DI,ntask))
+    allocate(xmean(nf_DI,ntask))
+    allocate(norm_xprime(nf_DI,ntask))
+    allocate(prod_xtx(nf_DI,ncol(mpirank+1),ntask))
+    allocate(intercept(ntask))
+    allocate(beta(nf_DI,ntask)) 
+    allocate(beta_init(nf_DI,ntask))
+    allocate(yprime(maxns,ntask))
+    allocate(prod_xty(nf_DI,ntask))
+    allocate(lassormse(ntask))
+    allocate(ymean(ntask))
+    
+    ! initialization for L1
+    xprime=0
+    xdprime=0
+    xmean=0
+    norm_xprime=0
+    prod_xtx=0
+    beta=0
+    beta_init=0.0
+    yprime=0
+    prod_xty=0
+    lassormse=0
+    ymean=0
+    
+    !-------------------------------------------------------------------------------------------------------
+    ! standardize and precompute the data for lasso
+    ! transform y=a+xb into y'=x''b'; 
+    ! y'=y-ymean; x'(:,j)=x(:,j)-xmean(j); x(:,j)''=x(:,j)'/nomr(x'(j)); 
+    ! a=ymean-xmean*b; b'=b*norm(x'); solving y'=x''b' to get b' and b, and then a
+    ! Finally we get y=a+bx
+    ! (observation) weighting is applied to square error, not the training property.
+    ! see "J. Friedman, T. Hastie, and R. Tibshirani, 2010 "A note on the group lasso and a sparse group lasso"
+    !--------------------------------------------------------------------------------------------------------
+    ! if(mpirank==0) write(9,'(a)') 'data standardizing ... '
+    !yprime
+    do i=1,ntask
+        ymean(i)=sum(yinput(:nsample(i),i))/nsample(i)
+        yprime(:nsample(i),i)=yinput(:nsample(i),i)-ymean(i)
+    end do
+    
+    do i=1,ntask
+       !xprime and xdprime
+         do j=1,nf_DI
+           xmean(j,i)=sum(xinput(:nsample(i),j,i))/nsample(i)
+           xprime(:nsample(i),j,i)=xinput(:nsample(i),j,i)-xmean(j,i)
+           norm_xprime(j,i)=sqrt(sum(xprime(:nsample(i),j,i)**2))
+           if(abs(norm_xprime(j,i))<1d-10) then
+           if(mpirank==0) print *, 'Error: Norm of the feature ',j,' of task ',i,' is zero'
+           stop
+           end if
+           xdprime(:nsample(i),j,i)=xprime(:nsample(i),j,i)/norm_xprime(j,i)
+         end do
+         !xty
+         prod_xty(:,i)=matmul(transpose(xdprime(:nsample(i),:,i)),weight(:nsample(i),i)*yprime(:nsample(i),i))
+         !xtx=(xtx)t !prod_xtx(mpii,:)=prod_xtx(:,mpii)
+         do mpii=1,ncol(mpirank+1)
+         prod_xtx(:,mpii,i)=matmul(xdprime(:nsample(i),mpii+sum(ncol(1:mpirank)),i)*weight(:nsample(i),i),&
+                                            xdprime(:nsample(i),:,i))
+         end do
+    end do
+    
+    !--------------------------------------------------------------------------------------------------------------
+    ! get lambda_max
+    !---------------------------------------------------------------------------------------------------------------
+    lambda_max=0
+    do j=1,nf_DI
+    lambda=sqrt(sum(prod_xty(j,:)**2))
+    if(lambda>lambda_max) lambda_max=lambda
+    end do
+    
+    if(mpirank==0) then
+    write(9,'(a)') 'Feature selection by multi-task lasso starts ... '
+    end if
+    
+    !----------------------------------------
+    ! (Elastic net) H. Zou, and T. Hastie, J. R. Statist. Soc. B 67, 301 (2005).
+    ! L1para.elastic=1 is back to lasso (Eq. 14)
+    !----------------------------------------
+    !if(L1para.elastic<1.0) then
+    !prod_xtx=L1para.elastic*prod_xtx
+    !do j=1,nf_DI
+    !prod_xtx(j,j,:)=prod_xtx(j,j,:)+(1-L1para.elastic)
+    !end do
+    !end if
+    
+    !---------------------------------------------------------------------------
+    ! Calculating lasso path with decreasing lambdas
+    !---------------------------------------------------------------------------
+    
+    !Size of the active set
+    nactive=0
+    ! iteration starts
+    do ntry=1,L1para.nlambda
+        ! create lambda sequence
+        ! max and min in log space: max=log10(lambda_max),min=log10(0.001*lambda_max)
+        ! density, i.e.: 100 points, interval=(max-min)/100=0.03
+        ! lambda_i=10^(max-0.03*(i-1))
+        lambda=10**(log10(lambda_max)-3.0/L1para.dens*(ntry-1))
+    
+        ! call mtlasso
+        call mtlasso_mpi(prod_xty,prod_xtx,lambda,L1para.max_iter,L1para.tole,beta_init,run_iter,beta,nf,ncol)
+        
+        ! lasso rmse
+        do i=1,ntask
+           lassormse(i)=sqrt(sum(weight(:nsample(i),i)*(yprime(:nsample(i),i)-matmul(xdprime(:nsample(i),:,i),&
+                       beta(:,i)))**2)/nsample(i))  ! RMSE
+        end do
+
+        if(task_weighting==1) then ! tasks treated equally
+           rmse_ave=sqrt(sum(lassormse**2)/ntask)  ! quadratic mean
+        else if(task_weighting==2) then ! tasks weighted by #sample_task_i/total_sample
+           rmse_ave=sqrt(sum(lassormse**2*nsample)/sum(nsample))
+        end if
+
+        ! L1para.warm_start
+        if(L1para.warm_start) beta_init=beta
+        
+        ! intercept and beta
+        do i=1,ntask
+          intercept(i)=ymean(i)-sum(xmean(:,i)*beta(:,i)/norm_xprime(:,i))
+        end do   
+                       
+        ! store the coeff. and id of features of this active set
+        allocate(coeff(nf,ntask))  ! nf: number of selected features
+        allocate(idf(nf))
+        coeff=0
+        idf=0
+        k=0
+        do j=1,nf_DI
+          if(maxval(abs(beta(j,:)))>1d-10) then
+           k=k+1
+           idf(k)=j
+           coeff(k,:)=beta(j,:)
+          end if
+        end do
+        if(k/=nf .and. mpirank==0 ) then
+          print *, 'LASSO: Selected features not correctly identified !'; stop
+        end if
+        
+        ! add to the total active set
+        isnew=.true.  
+        do k=1,nf
+          if(any(idf(k)==activeset(:nactive))) isnew=.false.
+          if(isnew) then
+             nactive=nactive+1
+             activeset(nactive)=idf(k)
+          end if
+          isnew=.true.
+          if(nactive==nf_L0) exit
+        end do
+        
+        ! output information of this lambda
+        if(mpirank==0) then
+        write(9,'(/)') 
+        write(9,'(a,i5)') 'Try: ',ntry
+        write(9,'(a)') '----------------'
+        write(9,'(a30,e20.10)')  'Lambda: ', lambda
+        write(9,'(a30,i10)')   'Number of iteration: ',run_iter
+2004    format(a30,*(f10.6))
+        write(9,2004)  ' LASSO RMSE averaged over tasks (quadratic mean): ', rmse_ave
+        write(9,2004)  ' LASSO RMSE for each task: ', lassormse
+        write(9,'(a30,i6)')   'Size of this active set: ',nf
+2005    format(a30,*(i8))
+        if(nf/=0) write(9,2005)   'This active set: ',idf
+        if(nf/=0) then
+2006    format(a25,i3.3,a2,*(e20.10))
+        do i=1,ntask
+        write(9,2006) 'LASSO Coeff._',i,': ', coeff(:,i)
+        end do
+        end if
+        write(9,'(a30,i6)') 'Size of the tot act. set: ',nactive
+2007    format(a30,*(i8))
+        if(nactive/=0) write(9,2007) 'Total active set: ',(activeset(j),j=1,nactive)
+        end if
+        deallocate(coeff)
+        deallocate(idf)
+        
+        ! exit
+        if(nactive>=nf_L0) then
+           if(mpirank==0) write(9,'(/a,i3)') 'Total number of selected features >= ',nf_L0
+           exit
+        else if (nactive==nf_DI) then
+           if(mpirank==0) write(9,'(/a)') 'The whole feature space is already selected!'
+           exit
+        else if (ntry==L1para.nlambda) then
+           if(mpirank==0) write(9,'(/a)') 'Number of lambda trials hits the max !'
+           exit
+        else if (maxval(lassormse)<L1para.minrmse) then
+           if(mpirank==0) write(9,'(/a)') 'Required accuracy reached !'
+           exit
+        end if
+    
+    end do
+    deallocate(prod_xty)
+    deallocate(prod_xtx)
+    deallocate(ncol)
+    deallocate(beta)
+    deallocate(beta_init)
+    deallocate(xprime)
+    deallocate(xdprime)
+    deallocate(xmean)
+    deallocate(yprime)
+    deallocate(intercept)
+    deallocate(norm_xprime)
+    deallocate(ymean)
+    deallocate(lassormse)
+    if(mpirank==0) write(9,'(a)') ' L1 finished ! '
+    !---------------
+    ! L0 of the L1L0
+    !---------------
+     call model(xinput,yinput,expr,nactive,activeset)
+ 
+else if (trim(adjustl(method_so))=='L0' .or. nf_DI==nf_L0) then
+    nactive=nf_L0
+    do i=1,nf_L0
+       activeset(i)=i
+    end do
+    if(ptype==1) then
+     call model(xinput,yinput,expr,nactive,activeset)
+    else if (ptype==2) then
+     call model2(xinput,expr,nactive,activeset)
+    end if
+end if
+
+
+!--------------------
+deallocate(xinput)
+deallocate(yinput)
+deallocate(expr)
+deallocate(activeset)
+deallocate(weight)
+
+call mpi_barrier(mpi_comm_world,mpierr)
+if(mpirank==0) then
+  mytime.eDI=mpi_wtime()
+  write(9,'(a,f15.2)') 'Time (second) used for this DI: ',mytime.eDI-mytime.sDI
+end if
+
+end subroutine
+
+
+subroutine model(x,y,expr,nactive,activeset)
+! model selection by L0 for regression
+integer nactive,activeset(:),i,j,k,l,loc(1),isc,ii(iFCDI),select_model(max(nmodel,1),iFCDI),&
+        mID(max(nmodel,1),iFCDI),sc_loc(1),mpii,mpij
+integer*8 totalm,bigm,nall,nrecord,nupper,nlower,njob(mpisize)
+real*8 x(:,:,:),y(:,:),tmp,tmp2,wrmse(ntask),rmse(ntask),sc_rmse(2**iFCDI,ntask),maxae(ntask),&
+sc_maxae(2**iFCDI,ntask),beta(iFCDI,ntask),intercept(ntask),select_rmse(max(nmodel,1)),&
+select_score(max(nmodel,1),2,1+ntask),select_maxae(max(nmodel,1)),&
+select_coeff(max(nmodel,1),iFCDI+1,ntask),mscore(max(nmodel,1),2,1+ntask),&
+select_metric(max(nmodel,1)),mcoeff(max(nmodel,1),iFCDI+1,ntask),sc_beta(iFCDI,2**iFCDI,ntask),&
+sc_intercept(2**iFCDI,ntask),sc_tmp(2**iFCDI),sc_tmp2(2**iFCDI),mpicollect(mpisize)
+character expr(:)*200,line_name*100
+logical isgood
+real   progress
+
+if(nmodel<1) nmodel=1
+
+   progress=0.2
+
+   ! job assignment for each CPU
+   nall=1
+   do mpii=1,iFCDI
+      nall=nall*(nactive-mpii+1)/mpii  ! total: C(N,M)
+   end do
+   if(nactive==iFCDI) nall=1
+   njob=nall/mpisize
+
+   mpij=mod(nall,int8(mpisize))
+   do mpii=1,mpisize-1
+     if(mpii<=mpij) njob(mpii+1)=njob(mpii+1)+1   
+   end do
+
+   ! initialization   
+   select_rmse=10*maxval(abs(y))
+   select_maxae=10*maxval(abs(y))
+   totalm=0
+
+! get the best nmodel models on each CPU core
+   nrecord=0
+   do k=1,nactive   ! features
+      ii(1)=k   ! d1
+
+      do j=2,iFCDI   ! d2, ..., d_iFCDI
+        ii(j)=ii(j-1)+1   ! d1=k, d2=k+1,d3=k+2,...
+      end do      
+
+      nlower=sum(njob(:mpirank))+1
+      nupper=sum(njob(:mpirank+1))
+      123 continue 
+      nrecord=nrecord+1
+      if( nrecord< nlower) goto 124   ! my job starts at nlower
+      if( nrecord> nupper) exit       ! my job ends at nupper
+
+      if(float(nrecord-nlower+1)/float(njob(mpirank+1))>=progress) then
+       do while( float(nrecord-nlower+1)/float(njob(mpirank+1))>= (progress+0.2) )
+              progress = progress+0.2
+       end do
+       write(*,'(a,i4,a,i4,a,f6.1,a)') 'dimension =',iFCDI,'   mpirank = ',mpirank,'   progress =',progress*100,'%'
+        progress=progress+0.2
+      end if
+
+      if ( trim(adjustl(metric))=='RMSE' .or. trim(adjustl(metric))=='MaxAE') then
+         do i=1,ntask
+            if(.not. scmt) then
+                 if(fit_intercept) then  ! linear fitting
+                    call orth_de(x(:nsample(i),[activeset(ii(:iFCDI))],i),y(:nsample(i),i),&
+                                 intercept(i),beta(:iFCDI,i),rmse(i))
+                 else
+                    call orth_de_nointercept(x(:nsample(i),[activeset(ii(:iFCDI))],i),y(:nsample(i),i),&
+                                             beta(:iFCDI,i),rmse(i))
+                    intercept(i)=0.d0
+                 end if
+                 if(isnan(rmse(i))) then   ! simply rmse=standard deviation
+                    rmse(i)=sqrt(sum((y(:nsample(i),i)-sum(y(:nsample(i),i))/nsample(i))**2)/nsample(i))
+                    intercept(i)=sum(y(:nsample(i),i))/nsample(i)
+                    beta(:iFCDI,i)=0.d0
+                 end if
+                 maxae(i)= maxval(abs(y(:nsample(i),i)-(intercept(i)+matmul(x(:nsample(i),&
+                                  [activeset(ii(:iFCDI))],i),beta(:iFCDI,i)))))
+            else  ! sign-constrained multi-task learning
+                 if(fit_intercept) then
+                    call sc_coord_descent(x(:nsample(i),[activeset(ii(:iFCDI))],i),y(:nsample(i),i),10000,1d-5,&
+                         iFCDI,sc_intercept(:2**iFCDI,i),sc_beta(:iFCDI,:2**iFCDI,i),sc_rmse(:2**iFCDI,i))
+                 else
+                    call sc_coord_descent_nointercept(x(:nsample(i),[activeset(ii(:iFCDI))],i),y(:nsample(i),i),&
+                         10000,1d-5,iFCDI,sc_beta(:iFCDI,:2**iFCDI,i),sc_rmse(:2**iFCDI,i))
+                    sc_intercept(:2**iFCDI,i)=0.d0
+                 end if
+                 if(isnan(rmse(i))) then
+                    sc_rmse(:,i)=sqrt(sum((y(:nsample(i),i)-sum(y(:nsample(i),i))/nsample(i))**2)/nsample(i))
+                    sc_intercept(:,i)=sum(y(:nsample(i),i))/nsample(i)
+                    sc_beta(:iFCDI,:,i)=0.d0
+                 end if
+                 do isc=1,2**iFCDI
+                    sc_maxae(isc,i)=maxval(abs(y(:nsample(i),i)-(sc_intercept(isc,i)+matmul(x(:nsample(i),&
+                                  [activeset(ii(:iFCDI))],i),sc_beta(:iFCDI,isc,i)))))
+                 end do
+            end if
+         end do
+
+         if( scmt ) then  ! best choice of coeff. signs
+             do isc=1,2**iFCDI
+                 if(task_weighting==1) then
+                     sc_tmp(isc)=sqrt(sum(sc_rmse(isc,:)**2)/ntask)
+                 else if(task_weighting==2) then
+                     sc_tmp(isc)=sqrt(sum((sc_rmse(isc,:)**2)*nsample)/sum(nsample))
+                 end if
+                 sc_tmp2(isc)=maxval(sc_maxae(isc,:))
+             end do
+             if( trim(adjustl(metric))=='RMSE' ) then
+                tmp=minval(sc_tmp)
+                sc_loc=minloc(sc_tmp)
+                tmp2=sc_tmp2(sc_loc(1))
+             elseif( trim(adjustl(metric))=='MaxAE' ) then
+                tmp2=minval(sc_tmp2)
+                sc_loc=minloc(sc_tmp2)
+                tmp=sc_tmp(sc_loc(1))
+             endif
+         else
+              if(task_weighting==1) then
+                 tmp=sqrt(sum(rmse**2)/ntask)  !overall error
+              else if(task_weighting==2) then
+                    tmp=sqrt(sum((rmse**2)*nsample)/sum(nsample))
+              end if
+              tmp2=maxval(maxae)        ! max(maxAE)
+         end if
+      end if
+    
+      isgood=.false.
+      if ( trim(adjustl(metric))=='RMSE' .and. any(tmp <select_rmse)) then
+         isgood = .true.
+         loc=maxloc(select_rmse)
+      else if ( trim(adjustl(metric))=='MaxAE' .and. any(tmp2 <select_maxae)) then
+         isgood = .true.
+         loc=maxloc(select_maxae)
+      end if
+
+      ! accept this model
+      if (isgood) then
+         totalm=totalm+1
+         select_rmse(loc(1))=tmp
+         select_maxae(loc(1))=tmp2
+         select_model(loc(1),:iFCDI)=activeset(ii(:iFCDI))
+         select_score(loc(1),1,1)=tmp
+         select_score(loc(1),2,1)=tmp2
+         do i=1,ntask
+             if( scmt ) then
+               select_coeff(loc(1),1,i)=sc_intercept(sc_loc(1),i)
+               select_coeff(loc(1),2:iFCDI+1,i)=sc_beta(:iFCDI,sc_loc(1),i)
+               select_score(loc(1),1,1+i)=sc_rmse(sc_loc(1),i)
+               select_score(loc(1),2,1+i)=sc_maxae(sc_loc(1),i)
+             else
+               select_coeff(loc(1),1,i)=intercept(i)
+               select_coeff(loc(1),2:iFCDI+1,i)=beta(:iFCDI,i)
+               select_score(loc(1),1,1+i)=rmse(i)
+               select_score(loc(1),2,1+i)=maxae(i)
+             end if
+         end do
+      end if
+
+      124 continue
+      ! update the index(e.g.: 123,124,125,134,135,145,234,...)
+      if(iFCDI==1) cycle  ! just go to increase the k
+      ii(iFCDI)=ii(iFCDI)+1  ! add 1 for the fastest index
+
+      ! when ii(j) hit max, ii(j-1) increase by 1, and ii(j) is reset to ii(j-1)+1
+      do j=iFCDI,3,-1  ! why 3? Just check what happen when iFCDI =1 and 2.
+        if(ii(j)> (nactive-(iFCDI-j)) ) ii(j-1)=ii(j-1)+1 ! reset ii(j-1) 
+      end do
+      do j=3,iFCDI  ! not needed for j <3
+        if(ii(j)> (nactive-iFCDI+j) ) ii(j)=ii(j-1)+1 ! reset ii(j)
+      end do
+
+      if(ii(2)>(nactive-(iFCDI-2)))  cycle  ! the second index hit max, one has to increase the k
+
+      goto 123
+   end do
+
+! collecting the best models
+   if(mpirank>0) then
+     call mpi_send(totalm,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+   else
+     do i=1,mpisize-1
+       call mpi_recv(bigm,1,mpi_integer8,i,1,mpi_comm_world,status,mpierr)
+       totalm=totalm+bigm
+     end do
+   end if
+   call mpi_bcast(totalm,1,mpi_integer8,0,mpi_comm_world,mpierr)
+
+   if(trim(adjustl(metric))=='RMSE') then
+     select_metric=select_rmse
+   else if (trim(adjustl(metric))=='MaxAE') then
+     select_metric=select_maxae     
+   end if
+
+   do j=1,min(int8(nmodel),totalm)  ! find the best models across all cores
+      loc=minloc(select_metric)
+      k=loc(1)
+      if(mpirank>0) then
+        call mpi_send(select_metric(loc(1)),1,mpi_double_precision,0,1,mpi_comm_world,status,mpierr)
+      else
+        mpicollect(1)=select_metric(loc(1))
+        do i=1,mpisize-1
+          call mpi_recv(mpicollect(i+1),1,mpi_double_precision,i,1,mpi_comm_world,status,mpierr)
+        end do
+        loc=minloc(mpicollect)
+      end if
+      call mpi_bcast(loc(1),1,mpi_integer,0,mpi_comm_world,mpierr)
+
+      if(mpirank==loc(1)-1) then
+        mID(j,:iFCDI)=select_model(k,:iFCDI)
+        mcoeff(j,:iFCDI+1,:)=select_coeff(k,:iFCDI+1,:)
+        mscore(j,:2,:)=select_score(k,:2,:)
+        select_metric(k)=10*maxval(abs(y))
+      end if
+      call mpi_bcast(mID(j,:iFCDI),iFCDI,mpi_integer,loc(1)-1,mpi_comm_world,mpierr)
+      call mpi_bcast(mscore(j,:2,:),2*(1+ntask),mpi_double_precision,loc(1)-1,mpi_comm_world,mpierr)
+      call mpi_bcast(mcoeff(j,:iFCDI+1,:),(iFCDI+1)*ntask,mpi_double_precision,loc(1)-1,mpi_comm_world,mpierr)
+   end do
+
+   ! Predictions from the best model to get the ypred
+   do i=1,ntask
+       do j=1,nsample(i)
+          ypred(sum(nsample(:i-1))+j)=mcoeff(1,1,i)+sum(x(j,[mID(1,:iFCDI)],i)*mcoeff(1,2:,i))
+       end do
+   end do
+
+   if(mpirank==0 .and. iFCDI==desc_dim) then !utput only the final dimension
+
+       call writeout(iFCDI,mID(1,:iFCDI),mscore(1,1,2:1+ntask),mscore(1,2,2:1+ntask),&
+                    mcoeff(1,2:,:),mcoeff(1,1,:),x,y,expr)
+
+      ! output the top models to the folder Models
+         write(line_name,'(a,i4.4,a,i3.3)') 'top',min(int8(nmodel),totalm),'_D',iFCDI
+         open(fileunit,file='Models/'//trim(adjustl(line_name)),status='replace')
+         write(fileunit,'(4a12)') 'Rank','RMSE','MaxAE','Feature_ID'
+2008     format(i12,2f12.6,a,*(i8))
+         do i=1,min(int8(nmodel),totalm)
+           write(fileunit,2008,advance='no') i,mscore(i,:,1),'  (',mID(i,:iFCDI)
+           write(fileunit,'(a)') ')'
+         end do
+         close(fileunit)
+
+         write(line_name,'(a,i4.4,a,i3.3,a)') 'top',min(int8(nmodel),totalm),'_D',iFCDI,'_coeff'
+         open(fileunit,file='Models/'//trim(adjustl(line_name)),status='replace')
+         write(fileunit,'(a)') 'Model_ID, [(c_i,i=0,n)_j,j=1,ntask]'
+20081    format(i12,*(e20.10))
+         do i=1,min(int8(nmodel),totalm)
+           write(fileunit,20081) i,(mcoeff(i,:iFCDI+1,j),j=1,ntask)
+         end do
+         close(fileunit)
+   end if
+
+call mpi_barrier(mpi_comm_world,mpierr)
+end subroutine
+
+
+subroutine writeout(iFCDI,id,rmse,maxae,betamin,interceptmin,x,y,expr)
+! output the information for regression
+integer iFCDI,i,j,id(:),k
+real*8 rmse(:),maxae(:),betamin(:,:),interceptmin(:),x(:,:,:),y(:,:),yfit
+character expr(:)*200,line_name*100
+
+ ! output the information to SISSO.out
+ write(9,'(a)') ' '
+ write(9,'(i3,a)') iFCDI,'D descriptor/model(y=sum(ci*di)+c0) : '
+ write(9,'(a)')'================================================================================'
+
+ do i=1,iFCDI
+ write(9,'(a,i3.3,3a,i6.6)') '     d',i,' = ',trim(adjustl(expr(id(i)))),'    feature_ID:',id(i)
+ end do
+ write(9,'(a)') ' '
+
+ 2009 format(a20,*(e20.10))
+ 2010 format(a20,a3,a2,*(e20.10))
+ 2011 format(a10,2a20,*(a19,i1.1))
+ 2012 format(i10,*(e20.10))
+ do i=1,ntask
+   if(ntask>1) then
+     write(9,'(a,i3.3)') '     task_',i
+     write(9,2009) '        coeff.(ci): ', betamin(:iFCDI,i)
+     write(9,'(a20,e20.10)')  '                c0: ', interceptmin(i)
+     write(9,'(a20,2e20.10)') '    RMSE and MaxAE: ', rmse(i),maxae(i)
+     write(line_name,'(a,i3.3,a,i3.3,a)') 'desc_D',iFCDI,'_t',i,'.dat'
+     open(fileunit,file='Models/data_top1/'//trim(adjustl(line_name)),status='replace')
+     write(fileunit,2011) 'Index','y_true','y_pred',('descriptor_',j,j=1,iFCDI)
+       do j=1,nsample(i)
+          yfit=interceptmin(i)+sum(x(j,[id(:iFCDI)],i)*betamin(:iFCDI,i))
+          write(fileunit,2012) j,y(j,i),yfit,x(j,[id(:iFCDI)],i)
+       end do
+     close(fileunit)
+    else
+     write(9,2009) '        coeff.(ci): ', betamin(:iFCDI,i)
+     write(9,'(a20,e20.10)')  '                c0: ', interceptmin(i)
+     write(9,'(a20,2e20.10)') '    RMSE and MaxAE: ', rmse(i),maxae(i)
+     write(line_name,'(a,i3.3,a)') 'desc_D',iFCDI,'.dat'
+     open(fileunit,file='Models/data_top1/'//trim(adjustl(line_name)),status='replace')
+     write(fileunit,2011) 'Index','y_true','y_pred',('descriptor_',j,j=1,iFCDI)
+       do j=1,nsample(i)
+          yfit=interceptmin(i)+sum(x(j,[id(:iFCDI)],i)*betamin(:iFCDI,i))
+          write(fileunit,2012) j,y(j,i),yfit,x(j,[id(:iFCDI)],i)
+       end do
+     close(fileunit)
+    end if
+ end do
+
+ if(ntask>1) then
+    if(task_weighting==1) then  ! tasks are treated equally
+       write(9,'(/a,2f10.6)')  '     Overall RMSE and MaxAE: ',sqrt(sum(rmse**2)/ntask),maxval(maxae)
+    else if(task_weighting==2) then  ! weighted by sample size
+       write(9,'(/a,2f10.6)')  '     Overall RMSE and MaxAE: ',sqrt(sum((rmse**2)*nsample)/sum(nsample)),maxval(maxae)
+    end if
+    if(task_weighting==1) then
+       write(9,'(a)') '     RMSE = sqrt(sum(RMSE_i^2)/ntask)   (for task_weighting=1)'   !sqrt(sum(rmse**2)/ntask)
+    else if(task_weighting==2) then
+       write(9,'(a)') '     RMSE = sqrt(sum(RMSE_i^2*n_i)/sum(n_i))   (for task_weighting=2)'
+                         !   sqrt(sum((rmse**2)*nsample)/sum(nsample))
+    end if
+    write(9,'(a)') '     MaxAE = max(MaxAE_i)'  !,maxval(maxae)
+ end if
+ write(9,'(a)')'================================================================================'
+
+end subroutine
+
+
+subroutine model2(x,expr,nactive,activeset)
+! model selection by L0 for classification
+integer nactive,activeset(:),i,j,k,l,loc(1),ii(iFCDI),itask,mm1,mm2,mm3,mm4,ns,select_model(max(nmodel,1),iFCDI),&
+mID(max(nmodel,1),iFCDI),overlap_n,overlap_n_tmp,select_overlap_n(max(nmodel,1)),nh,ntri,nconvexpair
+integer*8 totalm,bigm,nall,nrecord,nlower,nupper
+real*8  x(:,:,:),mscore(max(nmodel,1),2),overlap_size,overlap_size_tmp,select_overlap_size(max(nmodel,1)),&
+hull(ubound(x,1),2),area(maxval(ngroup(:,1000))),mindist,xtmp1(ubound(x,1),3),xtmp2(ubound(x,1),3)
+character expr(:)*200,line_name*100
+integer*8 njob(mpisize),mpii,mpij
+real*8 mpicollect(mpisize,2)
+logical isoverlap
+real progress
+integer,allocatable:: triangles(:,:)
+
+if(nmodel<1) nmodel=1
+
+  progress=0.2
+  if(iFCDI>2) stop 'Error: Current code supports only 1D and 2D in classification!'
+
+   ! job assignment for each CPU core
+   nall=1
+   do mpii=1,iFCDI
+      nall=nall*(nactive-mpii+1)/mpii
+   end do
+   if(nactive==iFCDI) nall=1
+   njob=nall/mpisize
+
+   mpij=mod(nall,int8(mpisize))
+   do mpii=1,mpisize-1
+     if(mpii<=mpij) njob(mpii+1)=njob(mpii+1)+1   
+   end do
+
+   ! initialization   
+   select_overlap_n=sum(nsample)*sum(ngroup(:,1000))  ! set to a large value
+   totalm=0
+
+! get the best nmodel models on each CPU core
+   nrecord=0
+   do k=1,nactive
+      ii(1)=k
+
+      do j=2,iFCDI
+        ii(j)=ii(j-1)+1   ! starting number
+      end do      
+
+      nlower=sum(njob(:mpirank))+1
+      nupper=sum(njob(:mpirank+1))
+      123 continue 
+      nrecord=nrecord+1
+
+      if( nrecord< nlower) goto 124
+      if( nrecord> nupper) exit
+
+      if(float(nrecord-nlower+1)/float(njob(mpirank+1))>=progress) then
+        do while( float(nrecord-nlower+1)/float(njob(mpirank+1))>= (progress+0.2))
+              progress = progress+0.2
+        end do
+        write(*,'(a,i4,a,i4,a,f6.1,a)') 'dimension =',iFCDI,'   mpirank =',mpirank,'   progress =',progress*100,'%'
+        progress=progress+0.2
+      end if
+
+      ! initialization
+      overlap_n=0
+      overlap_size=0.d0
+      mindist=-1d10
+      isoverlap=.false.
+      nconvexpair=0
+
+      !------ get the score for this model ----------
+      do itask=1,ntask
+        ! area
+           if(iFCDI==1) then
+             do i=1,ngroup(itask,1000) ! number of groups in the task $itask
+              if(isconvex(itask,i)==0) cycle
+              mm1=sum(ngroup(itask,:i-1))+1 
+              mm2=sum(ngroup(itask,:i))
+              area(i)=maxval(x(mm1:mm2,[activeset(ii(:iFCDI))],itask))-minval(x(mm1:mm2,[activeset(ii(:iFCDI))],itask))
+             end do
+           else if(iFCDI==2) then
+             do i=1,ngroup(itask,1000)
+              if(isconvex(itask,i)==0) cycle
+              mm1=sum(ngroup(itask,:i-1))+1 
+              mm2=sum(ngroup(itask,:i))
+              area(i)=convex2d_area(x(mm1:mm2,[activeset(ii(:iFCDI))],itask))
+             end do
+           else if(iFCDI==3) then
+             area(:ngroup(itask,1000))=1.d0  ! not yet implemented for 3D volumes
+           end if
+
+        !  model scores
+           do i=1,ngroup(itask,1000)-1
+             mm1=sum(ngroup(itask,:i-1))+1
+             mm2=sum(ngroup(itask,:i))
+             do j=i+1,ngroup(itask,1000)
+               mm3=sum(ngroup(itask,:j-1))+1
+               mm4=sum(ngroup(itask,:j))
+
+               IF(isconvex(itask,i)==1 .and. isconvex(itask,j)==1) then
+                 nconvexpair=nconvexpair+1
+                 if(iFCDI==1) then
+                     xtmp1(mm1:mm2,1)=x(mm1:mm2,activeset(ii(iFCDI)),itask)
+                     xtmp2(mm3:mm4,1)=x(mm3:mm4,activeset(ii(iFCDI)),itask)
+                     call convex1d_overlap(xtmp1(mm1:mm2,1),xtmp2(mm3:mm4,1),bwidth,overlap_n_tmp,overlap_size_tmp)
+                 else if(iFCDI==2) then
+                     xtmp1(mm1:mm2,:2)=x(mm1:mm2,activeset(ii(:2)),itask)
+                     xtmp2(mm3:mm4,:2)=x(mm3:mm4,activeset(ii(:2)),itask)
+                     call convex2d_overlap(xtmp1(mm1:mm2,:2),xtmp2(mm3:mm4,:2),bwidth,overlap_n_tmp,overlap_size_tmp)
+                 else if(iFCDI==3) then
+                     xtmp1(mm1:mm2,:3)=x(mm1:mm2,activeset(ii(:3)),itask)
+                     xtmp2(mm3:mm4,:3)=x(mm3:mm4,activeset(ii(:3)),itask)
+                     call convex3d_overlap(xtmp1(mm1:mm2,:3),xtmp2(mm3:mm4,:3),bwidth,overlap_n_tmp)
+                     overlap_size_tmp=0.d0  ! overlap_size is not yet available
+                 end if
+                 overlap_n=overlap_n+overlap_n_tmp
+                 if(overlap_size_tmp>=0) isoverlap=.true.
+ 
+                 if(.not. isoverlap) then  ! if separated
+                     if(mindist<overlap_size_tmp) then
+                         mindist=overlap_size_tmp ! renew the worst separation
+                         overlap_size=mindist  ! smaller, better
+                     end if
+                 else 
+                     if(overlap_size_tmp>=0.d0 .and. min(area(i),area(j))==0.d0) then !  0D feature
+                         overlap_size=max(0.d0,overlap_size)+1.d0   ! totally overlapped
+                    else if (overlap_size_tmp>=0.d0 .and. min(area(i),area(j))>0.d0) then ! not 0D feature
+                         overlap_size=max(0.d0,overlap_size)+overlap_size_tmp/(min(area(i),area(j)))  ! total overlap
+                    end if
+                 end if
+
+              ELSE IF(isconvex(itask,i)==1 .and. isconvex(itask,j)==0) then
+                 if(iFCDI==1) then
+                     xtmp1(mm1:mm2,1)=x(mm1:mm2,activeset(ii(iFCDI)),itask)
+                     xtmp2(mm3:mm4,1)=x(mm3:mm4,activeset(ii(iFCDI)),itask)
+                     overlap_n=overlap_n+convex1d_in(xtmp1(mm1:mm2,1),xtmp2(mm3:mm4,1),bwidth)
+                 else if(iFCDI==2) then
+                     xtmp1(mm1:mm2,:2)=x(mm1:mm2,activeset(ii(:2)),itask)
+                     xtmp2(mm3:mm4,:2)=x(mm3:mm4,activeset(ii(:2)),itask)
+                     overlap_n=overlap_n+convex2d_in(xtmp1(mm1:mm2,:2),xtmp2(mm3:mm4,:2),bwidth)
+                 else if(iFCDI==3) then
+                     xtmp1(mm1:mm2,:3)=x(mm1:mm2,activeset(ii(:3)),itask)
+                     xtmp2(mm3:mm4,:3)=x(mm3:mm4,activeset(ii(:3)),itask)
+                     call convex3D_hull(xtmp1(mm1:mm2,:3),ntri,triangles)
+                     overlap_n=overlap_n+convex3d_in(xtmp1(mm1:mm2,:3),xtmp2(mm3:mm4,:3),bwidth,ntri,triangles)
+                     deallocate(triangles)
+                 end if
+              ELSE IF(isconvex(itask,i)==0 .and. isconvex(itask,j)==1) then
+                 if(iFCDI==1) then
+                     xtmp1(mm1:mm2,1)=x(mm1:mm2,activeset(ii(iFCDI)),itask)
+                     xtmp2(mm3:mm4,1)=x(mm3:mm4,activeset(ii(iFCDI)),itask)
+                     overlap_n=overlap_n+convex1d_in(xtmp2(mm3:mm4,1),xtmp1(mm1:mm2,1),bwidth)
+                 else if(iFCDI==2) then
+                     xtmp1(mm1:mm2,:2)=x(mm1:mm2,activeset(ii(:2)),itask)
+                     xtmp2(mm3:mm4,:2)=x(mm3:mm4,activeset(ii(:2)),itask)
+                     overlap_n=overlap_n+convex2d_in(xtmp2(mm3:mm4,:2),xtmp1(mm1:mm2,:2),bwidth)
+                 else if(iFCDI==3) then
+                     xtmp1(mm1:mm2,:3)=x(mm1:mm2,activeset(ii(:3)),itask)
+                     xtmp2(mm3:mm4,:3)=x(mm3:mm4,activeset(ii(:3)),itask)
+                     call convex3D_hull(xtmp2(mm3:mm4,:3),ntri,triangles)
+                     overlap_n=overlap_n+convex3d_in(xtmp2(mm3:mm4,:3),xtmp1(mm1:mm2,:3),bwidth,ntri,triangles)
+                     deallocate(triangles)
+                 end if
+              END IF
+
+             end do ! j
+           end do ! i
+         end do ! itask
+         
+         if(isoverlap)  overlap_size=overlap_size/float(nconvexpair) ! smaller, better
+         !----------------------------------
+
+      ! store the good models 
+      if (any(overlap_n<select_overlap_n)) then
+         totalm=totalm+1
+         loc=maxloc(select_overlap_n)
+         select_overlap_n(loc(1))=overlap_n
+         select_overlap_size(loc(1))=overlap_size
+         select_model(loc(1),:iFCDI)=activeset(ii(:iFCDI))
+      else if (overlap_n==maxval(select_overlap_n) .and. any(overlap_size<select_overlap_size) )  then
+         totalm=totalm+1
+         loc=maxloc(select_overlap_size)
+         select_overlap_n(loc(1))=overlap_n
+         select_overlap_size(loc(1))=overlap_size
+         select_model(loc(1),:iFCDI)=activeset(ii(:iFCDI))         
+      end if
+
+      ! update models(123,124,125,134,135,145,234,...)
+      124 continue
+      if(iFCDI==1) cycle
+      ii(iFCDI)=ii(iFCDI)+1 
+
+      do j=iFCDI,3,-1
+        if(ii(j)> (nactive-(iFCDI-j)) ) ii(j-1)=ii(j-1)+1 
+      end do
+      do j=3,iFCDI
+        if(ii(j)> (nactive-iFCDI+j) ) ii(j)=ii(j-1)+1
+      end do
+      if(ii(2)>(nactive-(iFCDI-2)))  cycle
+      goto 123
+   end do
+
+! collecting the best models from all CPU cores
+   if(mpirank>0) then
+     call mpi_send(totalm,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+   else
+     do i=1,mpisize-1
+       call mpi_recv(bigm,1,mpi_integer8,i,1,mpi_comm_world,status,mpierr)
+       totalm=totalm+bigm
+     end do
+   end if
+   call mpi_bcast(totalm,1,mpi_integer8,0,mpi_comm_world,mpierr)
+
+   do j=1,min(int8(nmodel),totalm)
+      loc=minloc(select_overlap_n)
+      do k=1,min(int8(nmodel),totalm)
+        if(select_overlap_n(k)==select_overlap_n(loc(1)) .and. &
+           select_overlap_size(k)<select_overlap_size(loc(1))) loc(1)=k
+      end do
+      k=loc(1)
+      mscore(j,:)=(/dble(select_overlap_n(k)),select_overlap_size(k)/)
+
+      if(mpirank>0) then
+        call mpi_send(mscore(j,:),2,mpi_double_precision,0,1,mpi_comm_world,status,mpierr)
+      else
+        mpicollect(1,:)=mscore(j,:)
+        do i=1,mpisize-1
+          call mpi_recv(mpicollect(i+1,:),2,mpi_double_precision,i,1,mpi_comm_world,status,mpierr)
+        end do
+        loc=minloc(mpicollect(:,1))
+        do l=1,mpisize
+          if(mpicollect(l,1)==mpicollect(loc(1),1) .and. &
+             mpicollect(l,2)<mpicollect(loc(1),2) ) loc(1)=l
+        end do
+        l=loc(1)
+      end if
+      call mpi_bcast(l,1,mpi_integer,0,mpi_comm_world,mpierr)
+
+      if(mpirank==l-1) then
+        mID(j,:iFCDI)=select_model(k,:iFCDI)
+        select_overlap_n(k)=sum(nsample)*sum(ngroup(:,1000))   ! set a large value to avoid being reselected
+      end if
+      call mpi_bcast(mID(j,:iFCDI),iFCDI,mpi_integer,l-1,mpi_comm_world,mpierr)
+      call mpi_bcast(mscore(j,:),2,mpi_double_precision,l-1,mpi_comm_world,mpierr)
+   end do
+
+   ! Prediction for each sample to get the ypred
+   ypred=1.0  ! 'YES', classified!
+   do itask=1,ntask
+     do i=1,ngroup(itask,1000)  ! the element-1000 store the total number of groups
+     do j=sum(ngroup(itask,:i-1))+1,sum(ngroup(itask,:i))  ! samples in the group i
+        do k=1,ngroup(itask,1000)  ! check if sample j of group i is in the domain of group k
+          if(i==k) cycle  ! same group
+          if(isconvex(itask,k)==0) cycle ! group k is not convex domain
+          mm1=sum(ngroup(itask,:k-1))+1  ! samples in the group k
+          mm2=sum(ngroup(itask,:k))
+          if(iFCDI==1) then   ! 1D
+               if(convex1d_in(x(mm1:mm2,mID(1,iFCDI),itask),x(j:j,mID(1,iFCDI),itask),bwidth)==1) then
+                 ypred(sum(nsample(:itask-1))+j)=0.0   !'NO' 
+                 exit
+               end if
+          elseif(iFCDI==2) then
+              if(convex2d_in(x(mm1:mm2,[mID(1,:iFCDI)],itask),x(j:j,[mID(1,:iFCDI)],itask),bwidth)==1) then
+                ypred(sum(nsample(:itask-1))+j)=0.0   !'NO'
+                exit
+              end if
+         end if   ! 3D not yet implemented
+        end do
+      end do
+      end do
+   end do
+
+   if(mpirank==0 .and. iFCDI==desc_dim) then  ! output only the final dimension
+
+      call writeout2(x,iFCDI,mID(1,:iFCDI),expr,mscore)
+
+      ! output the top models to the folder Models
+2013  format(i10,i20,f20.5,a,*(i8))
+2014  format(i10,i20,a,*(i8))
+      if(nmodel>1) then
+        write(line_name,'(a,i4.4,a,i3.3)') 'top',min(int8(nmodel),totalm),'_D',iFCDI
+        open(fileunit,file='Models/'//trim(adjustl(line_name)),status='replace')
+        if(iFCDI<=2) then
+            write(fileunit,'(a10,3a20)') 'rank','#data_overlap','size_overlap','feature_ID'
+        else 
+            write(fileunit,'(a10,2a20)') 'rank','#data_overlap','feature_ID'
+        end if
+        do i=1,min(int8(nmodel),totalm)
+           if(iFCDI<=2) then
+               write(fileunit,2013,advance='no') i,nint(mscore(i,1)),mscore(i,2),'   (',mID(i,:iFCDI)
+           else
+               write(fileunit,2014,advance='no') i,nint(mscore(i,1)),'   (',mID(i,:iFCDI)
+           end if
+           write(fileunit,'(a)') ')'
+        end do
+        close(fileunit)
+      end if
+
+      if(iFCDI>=2) then
+        if(iFCDI==2) then
+           open(fileunit,file='convex2D_hull',status='replace')
+           write(fileunit,'(a)') 'The coordinates (x,y) of 2D-vertices arranged in clockwise direction'
+        end if
+        if(iFCDI==3) then
+           open(fileunit,file='convex3D_hull',status='replace')
+           write(fileunit,'(a)') 'Facet-triangles with each indicated by three indexes of the data of this class'
+           write(fileunit,'(a)') 'E.g.: ploting a 3D convex hull by using the MATLAB function: trisurf(Tri,X,Y,Z)'
+        end if
+
+         do itask=1,ntask
+           if(ntask>1) write(fileunit,'(a,i5)') 'Task :',itask
+         do i=1,ngroup(itask,1000)
+           mm1=sum(ngroup(itask,:i-1))+1
+           mm2=sum(ngroup(itask,:i))
+           if(isconvex(itask,i)==0) cycle
+           write(fileunit,'(a,i5,g20.10)') 'Hull',i
+           if(iFCDI==2) then
+              call convex2D_hull(x(mm1:mm2,[mID(1,:iFCDI)],itask),j,hull)
+              do k=1,j
+                write(fileunit,'(2e20.10)') hull(k,:)
+              end do
+           elseif(iFCDI==3) then
+              call convex3D_hull(x(mm1:mm2,[mID(1,:iFCDI)],itask),j,triangles)
+              do k=1,j
+                write(fileunit,'(3i5)') triangles(k,:)
+              end do
+              deallocate(triangles) ! allocated in the convex3D_hull subroutine
+           end if
+         end do
+         end do
+         close(fileunit)
+       end if
+
+   end if
+
+call mpi_barrier(mpi_comm_world,mpierr)
+end subroutine
+
+
+subroutine writeout2(x,iFCDI,id,expr,mscore)
+! output the information for classification
+integer iFCDI,i,j,id(:),k,l,itask,mm1,mm2,mm3,mm4,ndata_ol,ntri
+integer,allocatable:: triangles(:,:)
+real*8 mscore(:,:),tmp,tmp2,x(:,:,:)
+character expr(:)*200,line_name*100
+logical inside,convexpair
+
+ ! Output the information to SISSO.out
+ write(9,'(a)') ' '
+ write(9,'(i3,a)') iFCDI,'D descriptor: '
+ write(9,'(a)')'================================================================================'
+ 
+ do i=1,iFCDI
+ write(9,'(a,i3.3,3a,i6.6)') '     d',i,' = ',trim(adjustl(expr(id(i)))),'    feature_ID:',id(i)
+ end do
+ 
+ ! Output the data to the Models/data_top1
+ 2015 format(2a20,*(a19,i1.1))
+ 2016 format(i20,a20,*(e20.10))
+ ndata_ol=0
+ do itask=1,ntask
+   if(ntask>1) then
+       write(line_name,'(a,i3.3,a,i3.3,a)') 'desc_D',iFCDI,'_t',itask,'.dat'
+   else
+       write(line_name,'(a,i3.3,a)') 'desc_D',iFCDI,'.dat'
+   end if
+   open(fileunit,file='Models/data_top1/'//trim(adjustl(line_name)),status='replace')
+   write(fileunit,2015) 'index','classified?',('descriptor_',j,j=1,iFCDI)
+   do i=1,ngroup(itask,1000)  ! the groups in this task $itask
+   do j=sum(ngroup(itask,:i-1))+1,sum(ngroup(itask,:i))  ! samples in this group
+      inside=.false.
+      do k=1,ngroup(itask,1000)  ! check if sample j of group i is in the domain of group k
+        if(i==k) cycle
+        if(isconvex(itask,k)==0) cycle ! group k is not convex domain
+        mm1=sum(ngroup(itask,:k-1))+1
+        mm2=sum(ngroup(itask,:k))
+        if(iFCDI==1) then
+             if(convex1d_in(x(mm1:mm2,id(iFCDI),itask),x(j:j,id(iFCDI),itask),bwidth)==1) then
+               inside=.true.
+               exit
+             end if
+        elseif(iFCDI==2) then
+            if(convex2d_in(x(mm1:mm2,[id(:iFCDI)],itask),x(j:j,[id(:iFCDI)],itask),bwidth)==1) then
+              inside=.true.
+              exit
+            end if
+        elseif(iFCDI==3) then
+            call convex3D_hull(x(mm1:mm2,[id(:iFCDI)],itask),ntri,triangles)
+            if(convex3d_in(x(mm1:mm2,[id(:iFCDI)],itask),x(j:j,[id(:iFCDI)],itask),bwidth,ntri,triangles)==1) then
+              inside=.true.
+              deallocate(triangles)
+              exit
+            end if
+            deallocate(triangles)
+       end if
+     end do
+ 
+     if(inside) then
+         write(fileunit,2016) j,'NO',x(j,[id(:iFCDI)],itask)  ! unclassified
+         ndata_ol=ndata_ol+1
+     else
+         write(fileunit,2016) j,'YES',x(j,[id(:iFCDI)],itask) ! classified
+     end if
+   end do
+   end do
+   close(fileunit)
+ end do
+
+! write(9,'(/a,i10)') 'Number of data in all overlap regions:',int(mscore(1,1))
+ convexpair=.false.
+ do itask=1,ntask
+   do i=1,ngroup(itask,1000)
+     do j=i+1,ngroup(itask,1000)
+        if(isconvex(itask,i)==1 .and. isconvex(itask,j)==1) convexpair=.true.
+     end do
+   end do
+ end do
+ if(iFCDI<=2 .and. convexpair) then
+   write(9,'(/a,i10)') 'Number of data (double-counting removed) in all overlap regions: ', ndata_ol  
+   write(9,'(a,f15.5)') 'Size of the overlap (1D:length, 2D:area, 3D:volume):',mscore(1,2)
+   write(9,'(a)') 'Note: with negative, the magnitude means the min separation distance between domains.'
+ end if
+ write(9,'(a)')'================================================================================'
+ 
+end subroutine
+
+end module
+
diff --git a/src/FC.f90 b/src/FC.f90
new file mode 100644
index 0000000..5ae6d12
--- /dev/null
+++ b/src/FC.f90
@@ -0,0 +1,1680 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+module FC
+! feature construction
+!----------------------------------------------------------------------
+
+use var_global
+use libsisso
+implicit none
+! variables used by subroutines and functions in this module
+
+type feature
+  real*8,allocatable:: feat_data(:,:)
+  character(len=str_len),allocatable:: feat_name(:),lastop(:)*10
+  integer*8,allocatable:: feat_comp(:)
+  real*8,allocatable:: feat_unit(:,:)
+end type
+
+type feature_selected
+  integer*8 nselect
+  real*8,allocatable:: feat_data(:,:),feat_score(:,:)
+  character(len=str_len),allocatable:: feat_name(:)
+  integer*8,allocatable:: feat_comp(:)
+end type
+
+type feature_sis
+  real*8,allocatable:: feat_data(:,:),feat_score(:,:)
+  character(len=str_len),allocatable:: feat_name(:)
+end type
+
+type(feature) gen
+type(feature_selected) sel
+type(feature_sis) sis
+
+integer*8 ntot,nthis,nreject,nbasic_select,nextra_select,icomb
+real*8 score_threshold
+real*8,allocatable:: trainy(:),trainy_c(:)
+character(len=str_len),allocatable::  reject(:)
+
+contains
+
+
+subroutine feature_construction
+implicit none
+integer   loc(1),ioerr
+character line*500,phiname*5,reject_file_name*100,superline*(20*(1+sum(nf_sis(:desc_dim))))
+real*8    bbb,ccc,aaa(mpisize),tag(npoint)
+integer*8 i,j,k,l,ll,mm1,mm2,nf(20),mpii,mpij,mpik,total_comb,nfpcore_this(mpisize),mpin(mpisize),mpin2(mpisize)
+real*8,allocatable:: fID(:)
+integer*8,allocatable:: order(:)
+logical,allocatable:: available(:)
+type(feature) inp
+
+! Stop if the whole feature space had been selected.
+IF (iFCDI>1 .and. nf_sis_avai(iFCDI-1)<nf_sis(iFCDI-1)) THEN
+   if(mpirank==0) then
+      write(*,'(a)') 'The whole feature-space has been selected! No more FC will be performed.' 
+      write(9,'(a)') 'The whole feature-space has been selected! No more FC will be performed.'
+   end if
+   return
+END IF
+
+! running time by FC
+if(mpirank==0) mytime.sFC=mpi_wtime()
+
+!------------------------------------------------------
+allocate(trainy(npoint))
+allocate(trainy_c(npoint))
+i=max(1000,nsf)
+allocate(inp.feat_data(npoint,i))   
+allocate(inp.feat_name(i))
+allocate(inp.lastop(i))
+allocate(inp.feat_unit(nunit,i))
+allocate(inp.feat_comp(i))
+
+allocate(gen.feat_data(npoint,i))
+allocate(gen.feat_name(i))
+allocate(gen.lastop(i))
+allocate(gen.feat_comp(i))
+allocate(gen.feat_unit(nunit,i))
+
+nbasic_select=nf_sis(iFCDI)  ! parallel
+if(mpisize==1) nbasic_select=2*nf_sis(iFCDI)  ! serial
+nextra_select=min(nf_sis(iFCDI),50000)
+j=nbasic_select+nextra_select
+
+allocate(sel.feat_data(npoint,j))
+allocate(sel.feat_score(2,j))
+allocate(sel.feat_name(j))
+allocate(sel.feat_comp(j))
+
+! initialization
+!------------------
+trainy=res
+score_threshold=-1.0
+inp.feat_unit(:,:nsf)=feature_units
+inp.feat_data(:,1:nsf)=pfdata(:,1:nsf)
+inp.feat_name(:nsf)=pfname(:nsf)
+inp.lastop=''     ! the last operation of the generated feature
+inp.feat_comp=0
+
+! The 'tag' to be used by 'fID' during redundant check
+do j=1,ntask
+do i=1,nsample(j)    
+  tag(sum(nsample(:j-1))+i)=1.0d0+0.001d0*i
+end do
+end do
+
+! avoid duplication in the selected space
+if(iFCDI>1) reject_file_name='SIS_subspaces/Uspace.expressions'
+
+nreject=0
+if(iFCDI >1 .and. mpirank==0) then
+   open(fileunit,file=trim(adjustl(reject_file_name)),status='old')
+
+   ! find the number of expressions in the file
+   do while(.true.)
+      read(fileunit,*,iostat=ioerr)
+      if(ioerr<0) exit
+      nreject=nreject+1
+   end do
+
+   if(nreject>0) allocate(reject(nreject))
+   rewind(fileunit)
+   
+   do j=1,nreject
+      read(fileunit,'(a)') line
+      call string_split(line,reject(j:j),' ')
+      reject(j)=adjustl(reject(j))
+   end do
+   close(fileunit)
+   !ordering
+   do i=1,nreject-1
+       loc(1)=i
+       do j=i+1,nreject
+         if(reject(j)>reject(loc(1))) loc(1)=j
+       end do
+       if(loc(1)>i) then
+         line=trim(reject(i))
+         reject(i)=reject(loc(1))
+         reject(loc(1))=trim(line)
+       end if
+   end do
+end if
+
+call mpi_bcast(nreject,1,mpi_integer8,0,mpi_comm_world,mpierr)
+if(mpirank/=0 .and. nreject>0) allocate(reject(nreject))
+if(nreject>0) call mpi_bcast(reject,nreject*str_len,mpi_character,0,mpi_comm_world,mpierr)
+!---------------------------------
+! Population Standard Deviation
+!---------------------------------
+
+if(ptype==1) then
+   do j=1,ntask
+    mm1=sum(nsample(:j-1))+1
+    mm2=sum(nsample(:j))
+    trainy_c(mm1:mm2)= trainy(mm1:mm2)-sum(trainy(mm1:mm2))/(mm2-mm1+1)   ! centered
+   end do
+end if
+
+!---------------------------------------------------
+! feature construction start ...
+! Phi0 is the initial space with primary features
+!---------------------------------------------------
+sel.nselect=0 ! number of selected features
+nf=0      ! number of generated features from each combination
+
+i=nsf ! total number of primary features
+j=0
+
+! no combination, just primary features
+total_comb=0
+call combine(inp,1,i,0,0,'NO',j,total_comb) 
+! ntot is forced to equal total number of pf as all are already stored in inp.feat, yet sel.nselect is not
+! necessarily to be ntot if some pf is not good (e.g. constant).
+! The purpose here is store good primary features in sel.feat
+
+ntot=nsf    ! total number of features
+nthis=ntot  ! number of features generated from this combine()
+write(phiname,'(a,i2.2)') 'Phi',0
+if(mpirank==0)  call writeout(phiname,sel.nselect,ntot)  
+
+if(rung==0) then
+  deallocate(gen.feat_data)
+  deallocate(gen.feat_name)
+  deallocate(gen.lastop)
+  deallocate(gen.feat_comp)
+  deallocate(gen.feat_unit)
+end if
+
+!-------------------------
+!creating PHI1, PHI2, ...
+!-------------------------
+do icomb=1,rung
+    if(mpirank==0) write(*,'(/a,i2.2,a)') 'Generating Phi',icomb,' ...'
+
+    call mpi_barrier(mpi_comm_world,mpierr)
+
+    if(icomb==rung) then
+       deallocate(gen.feat_data)
+       deallocate(gen.feat_name)
+       deallocate(gen.lastop)
+       deallocate(gen.feat_comp)
+       deallocate(gen.feat_unit)
+    end if
+
+    ! allocating equivalent workfload for each core
+    ! Total jobs: T=N*M + N(N-1)/2, where N=nthis, M = ntot-nthis
+    ! Divide N into n parts: X1, X2, ..., Xi, ..., where n=mpisize
+    ! X1=X1(X1-1)/2+X1(N-X1+M) = T/n --> X1
+    ! X2=X2(X2-1)/2+X2(N-X1-X2+M) = T/n --> X2
+    ! Xi=Xi(Xi-1)/2+Xi(N-X1-X2-...-Xi+M) = T/n -->Xi
+    ! Let bbb=N+M-1/2-sum(X1+X2+...+X{i-1}),ccc=-(N*M+N(N-1)/2)/n
+    ! Xi= bbb-sqrt(bbb^2+2ccc)
+    ! mpin: store the Xi, mpin2: starting position of the Xi part in nthis
+      mpin=0
+      mpin2=0
+      ccc=-(nthis*(ntot-nthis)+nthis*(nthis-1)/2.d0)/float(mpisize)
+      do mpii=1,mpisize
+         bbb=ntot-0.5-sum(aaa(:mpii-1))
+         if((bbb**2+2*ccc)>0) then
+            aaa(mpii)=bbb-sqrt(bbb**2+2*ccc)
+            mpin(mpii)=ceiling(aaa(mpii))
+            if (sum(mpin(:mpii))>nthis) then
+                  mpin(mpii)=nthis-sum(mpin(:mpii-1))
+                  exit
+            end if
+         end if
+      end do
+
+      if(sum(mpin)<nthis) then  ! it can happen when ntot==nthis
+         mpin(mpisize)=mpin(mpisize)+nthis-sum(mpin)
+      end if
+
+      do i=1,mpisize
+        if(mpin(i)>0) mpin2(i)=sum(mpin(1:i-1))+1
+      end do
+
+
+     ! check if array size need to be increased
+      i=ntot-nthis+(nthis-mpin2(mpirank+1)+1)   ! total number of features to be stored in this core
+      if(ubound(inp.feat_data,2)< i)  &
+        call addm_inp(i-ubound(inp.feat_data,2),inp)
+
+     ! broadcast ntot-nthis
+       if(ntot>nthis) then
+        i=ntot-nthis
+        call mpi_bcast(inp.feat_data(:,:i),i*npoint,mpi_double_precision,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.feat_name(:i),i*str_len,mpi_character,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.lastop(:i),i*10,mpi_character,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.feat_unit(:,:i),i*nunit,mpi_double_precision,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.feat_comp(:i),i,mpi_integer8,0,mpi_comm_world,mpierr)
+       end if
+
+      if(mpirank==0) then  ! send nthis 
+        do i=2,mpisize
+          j=ntot-nthis+mpin2(i)  ! starting position
+          k=nthis-mpin2(i)+1   ! size
+          if(mpin(i)>0) then
+          call mpi_send(inp.feat_data(:,j:ntot),k*npoint,mpi_double_precision,i-1,31,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.feat_name(j:ntot),k*str_len,mpi_character,i-1,32,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.lastop(j:ntot),k*10,mpi_character,i-1,33,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.feat_comp(j:ntot),k,mpi_integer8,i-1,34,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.feat_unit(:,j:ntot),k*nunit,mpi_double_precision,i-1,35,mpi_comm_world,status,mpierr)
+          end if
+        end do
+      else  ! receive
+        j=ntot-nthis+1  ! start
+        l=ntot-nthis+(nthis-mpin2(mpirank+1)+1)  ! end
+        k=nthis-mpin2(mpirank+1)+1  ! size
+        if(mpin(mpirank+1)>0) then
+        call mpi_recv(inp.feat_data(:,j:l),k*npoint,mpi_double_precision,0,31,mpi_comm_world,status,mpierr)
+        call mpi_recv(inp.feat_name(j:l),k*str_len,mpi_character,0,32,mpi_comm_world,status,mpierr)
+        call mpi_recv(inp.lastop(j:l),k*10,mpi_character,0,33,mpi_comm_world,status,mpierr)
+        call mpi_recv(inp.feat_comp(j:l),k,mpi_integer8,0,34,mpi_comm_world,status,mpierr)
+        call mpi_recv(inp.feat_unit(:,j:l),k*nunit,mpi_double_precision,0,35,mpi_comm_world,status,mpierr)
+        end if
+      end if
+
+      ! feature combination
+      i=ntot-nthis+1  ! start1
+      j=ntot-nthis+mpin(mpirank+1)  ! end1
+      k=ntot-nthis+(nthis-mpin2(mpirank+1)+1) ! end2
+
+      total_comb=mpin(mpirank+1)*(ntot-nthis)+(mpin(mpirank+1)-1)*mpin(mpirank+1)/2.0+&
+                 mpin(mpirank+1)*(nthis-mpin2(mpirank+1)+1-mpin(mpirank+1))+mpin(mpirank+1) ! binary+unary
+      if(mpin(mpirank+1)>0) &
+      call combine(inp,i,j,1,k,trim(adjustl(ops(icomb))),nf(icomb),total_comb)
+
+      call mpi_barrier(mpi_comm_world,mpierr)
+
+  IF (icomb < rung) THEN
+
+       !---------------------------
+       ! redundant check
+       !---------------------------
+       ! create nfpcore_this: number of new features generated in each CPU core
+       if(mpirank/=0) then
+          call mpi_send(nf(icomb),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+       else
+          nfpcore_this(1)=nf(icomb)
+          do mpii=1,mpisize-1
+               call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+          end do
+       end if
+       call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+
+       ! creating a unique scalar number fID for each feature
+       allocate(fID(nf(icomb))) 
+       do i=1,nf(icomb)
+          fID(i)=sqrt(sum((tag+gen.feat_data(:,i))**2))
+       end do
+
+       ! create the "available" to store the repetition information.
+        allocate(available(nf(icomb)))
+        available=.true.
+       ! create the "order" to store the ordering information for bisection method.
+        allocate(order(nf(icomb)+1))
+ 
+       !----------------------------------------------------------------------------------------
+       ! redundant check inside each core: creating the "order" and "available".
+       if(mpirank==0) then
+            write(*,'(a,i15)') 'Number of newly generated features: ',sum(nfpcore_this) 
+            write(*,'(a)') 'Redundant check on the newly generated features ...'
+       end if
+        
+       ! inside each core
+       call dup_scheck(nfpcore_this(mpirank+1),fID,gen.feat_name,gen.feat_comp,gen.feat_data,order,available)
+       ! between cores
+       if(mpisize>1) call dup_pcheck(nfpcore_this,fID,gen.feat_name,gen.feat_comp,gen.feat_data,order,available)
+       !-----------------------------------------------------------------------------------------
+
+
+       !---------------------------
+       ! remove duplicated features
+       !---------------------------
+       j=0
+       do i=1,nf(icomb)
+          if(available(i)) then
+             j=j+1
+             gen.feat_data(:,j)=gen.feat_data(:,i)
+             gen.feat_name(j)=gen.feat_name(i)
+             gen.lastop(j)=gen.lastop(i)
+             gen.feat_comp(j)=gen.feat_comp(i)
+             gen.feat_unit(:,j)=gen.feat_unit(:,i)
+             fID(j)=fID(i)
+          end if
+       end do
+       nf(icomb)=j
+
+       !renew nfpcore_this
+       if(mpirank/=0) then
+          call mpi_send(nf(icomb),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+       else
+          nfpcore_this(1)=nf(icomb)
+          do mpii=1,mpisize-1
+               call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+          end do
+       end if
+       call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+
+       nthis=sum(nfpcore_this)
+       ntot=ntot+nthis
+       if(mpirank==0) then
+           write(*,'(a,i15)') 'Number of newly generated features after redundant check: ',nthis
+       end if
+      !--------------------------------------------------------------------
+      ! cp results of gen.feat in all cores to the inp.feat in core0
+      !--------------------------------------------------------------------
+       i=nf(icomb)
+       if(mpirank>0 .and. i>0) then
+          call mpi_send(gen.feat_data(:,:i),npoint*i,mpi_double_precision,0,2,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.feat_name(:i),i*str_len,mpi_character,0,3,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.lastop(:i),i*10,mpi_character,0,4,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.feat_comp(:i),i,mpi_integer8,0,5,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.feat_unit(:,:i),nunit*i,mpi_double_precision,0,6,mpi_comm_world,status,mpierr)
+       else if(mpirank==0) then
+          if(ntot>ubound(inp.feat_data,2)) call addm_inp(ntot-ubound(inp.feat_data,2),inp)
+          ! from core0 to core0
+          inp.feat_data(:,ntot-nthis+1:ntot-nthis+i)=gen.feat_data(:,:i)
+          inp.lastop(ntot-nthis+1:ntot-nthis+i)=gen.lastop(:i)
+          inp.feat_comp(ntot-nthis+1:ntot-nthis+i)=gen.feat_comp(:i)
+          inp.feat_name(ntot-nthis+1:ntot-nthis+i)=gen.feat_name(:i)
+          inp.feat_unit(:,ntot-nthis+1:ntot-nthis+i)=gen.feat_unit(:,:i)
+          ! from all other cores to core0
+          do mpii=1,mpisize-1
+              j=ntot-nthis+sum(nfpcore_this(:mpii))+1    ! start
+              k=ntot-nthis+sum(nfpcore_this(:mpii+1))    ! end
+              l=nfpcore_this(mpii+1)     ! size
+              if(l>0) then
+              call mpi_recv(inp.feat_data(:,j:k),npoint*l,mpi_double_precision,mpii,2,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.feat_name(j:k),l*str_len,mpi_character,mpii,3,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.lastop(j:k),l*10,mpi_character,mpii,4,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.feat_comp(j:k),l,mpi_integer8,mpii,5,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.feat_unit(:,j:k),nunit*l,mpi_double_precision,mpii,6,mpi_comm_world,status,mpierr)
+              end if
+         end do
+        end if
+
+       !----------------------------------------------------------------
+       ! collect the newly selected features from each core to mpirank0
+       !----------------------------------------------------------------
+       if(mpirank/=0) then
+         call mpi_send(sel.nselect,1,mpi_integer8,0,5,mpi_comm_world,status,mpierr)
+       else
+         mpik=sel.nselect
+         do mpii=1,mpisize-1
+           call mpi_recv(mpij,1,mpi_integer8,mpii,5,mpi_comm_world,status,mpierr)
+           mpik=mpik+mpij  ! count the total number of selected features
+         end do
+       end if
+       !--------------------------------------
+       ! print the space information
+       !--------------------------------------
+        if(mpirank==0) then
+          write(phiname,'(a,i2.2)') 'Phi',icomb
+          call writeout(phiname,mpik,ntot)
+        end if
+       ! delete useless space
+       deallocate(order)
+       deallocate(available)
+       deallocate(fID)
+  END IF
+
+end do
+! -------- end of feature combination ------
+
+! release the spaces
+deallocate(inp.feat_data)
+deallocate(inp.feat_name)
+deallocate(inp.lastop)
+deallocate(inp.feat_comp)
+deallocate(inp.feat_unit)
+
+!---------------------------------------------------------------
+!  collect the information of selected features from all cores
+!---------------------------------------------------------------
+
+if(rung==0) then
+ nfpcore_this=sel.nselect
+else
+ if(mpirank/=0) then
+     call mpi_send(nf(rung),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+     call mpi_send(sel.nselect,1,mpi_integer8,0,2,mpi_comm_world,status,mpierr)
+ else
+      nfpcore_this(1)=sel.nselect
+      do mpii=1,mpisize-1
+           call mpi_recv(mpij,1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+           nf(rung)=nf(rung)+mpij
+           call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,2,mpi_comm_world,status,mpierr)
+      end do
+     write(phiname,'(a,i2.2)') 'Phi',rung
+     write(*,'(a,i15)') 'Total number of newly generated features: ',nf(rung)
+     call writeout(phiname,sum(nfpcore_this),ntot+nf(rung))
+ end if
+end if
+
+call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+call mpi_barrier(mpi_comm_world,mpierr)
+!-----------------------------------------------------------------------------
+
+! redundant check for selected features
+!---------------------------------------
+allocate(available(nfpcore_this(mpirank+1)))
+available=.true.
+allocate(order(nfpcore_this(mpirank+1)+1))
+
+
+if(mpirank==0) write(*,'(/a)') 'Redundant check on selected features ...'
+! serial redundant check
+call dup_scheck(nfpcore_this(mpirank+1),sel.feat_score(1,:),sel.feat_name,sel.feat_comp,sel.feat_data,order,available)
+
+! parallel redundant check
+if(mpisize>1) call dup_pcheck(nfpcore_this,sel.feat_score(1,:),sel.feat_name,sel.feat_comp,sel.feat_data,order,available)
+
+!---------------------------------------
+! sure independence screening
+!---------------------------------------
+if(mpirank==0) then
+    allocate(sis.feat_data(npoint,nf_sis(iFCDI)))
+    allocate(sis.feat_score(2,nf_sis(iFCDI)))
+    allocate(sis.feat_name(nf_sis(iFCDI)))
+end if
+
+! selecting the best features from sel.XXX to sis.XXX
+call sure_indep_screening(nfpcore_this,available)
+
+! output the selected feature space
+!--------------------------------------
+if(mpirank==0) then
+   ! output the space.expressions
+   if(iFCDI==1) then
+     open(fileunit,file='SIS_subspaces/Uspace.expressions',status='replace')
+   else
+     open(fileunit,file='SIS_subspaces/Uspace.expressions',position='append',status='old')
+   end if
+   if(ptype==1) then
+      do i=1,nf_sis_avai(iFCDI)
+        write(fileunit,'(2a,f12.4)') trim(sis.feat_name(i)),'     SIS_score =',sis.feat_score(1,i)
+      end do
+   else if(ptype==2) then
+      do i=1,nf_sis_avai(iFCDI)
+        sis.feat_score(1,i)=1.d0/sis.feat_score(1,i)-1.d0   ! score 1: overlap_n, score 2: normalized overlap_length
+        if(abs(sis.feat_score(2,i))>1d9) then
+           write(fileunit,'(2a,i6,a,f12.4)') trim(sis.feat_name(i)),'    N_overlap =',nint(sis.feat_score(1,i))        
+        else
+            if( nint(sis.feat_score(1,i))/=0 ) then  ! overlapped
+               sis.feat_score(2,i)=1.d0/sis.feat_score(2,i)-1.d0  ! overlapped length
+            else   
+               sis.feat_score(2,i)=-sis.feat_score(2,i)  ! separation distance
+            end if
+           write(fileunit,'(2a,i6,a,f12.4)') trim(sis.feat_name(i)),'    N_overlap =',nint(sis.feat_score(1,i)),&
+                                                                   '    S_overlap =',sis.feat_score(2,i)
+        end if
+      end do
+   end if
+   close(fileunit)
+
+   ! output the data
+2001  format(*(e20.10))
+     do j=1,ntask
+        mm1=sum(nsample(:j-1))+1
+        mm2=sum(nsample(:j))
+        if(ntask>1) then
+            write(line,'(a,i3.3,a)') 'Uspace_t',j,'.dat'
+        else
+            write(line,'(a)') 'Uspace.dat'
+        end if
+        
+        if(iFCDI>1) then
+            call rename('SIS_subspaces/'//trim(adjustl(line)),'SIS_subspaces/'//trim(adjustl(line))//'_tmp')
+            open(fileunit+1,file='SIS_subspaces/'//trim(adjustl(line))//'_tmp',status='old')
+        end if
+        open(fileunit,file='SIS_subspaces/'//trim(adjustl(line)),status='replace')
+
+        do ll=mm1,mm2
+           superline=''
+           if(iFCDI==1) then
+               if(ptype==1) then
+                    write(fileunit,2001) trainy(ll),sis.feat_data(ll,:nf_sis_avai(iFCDI))
+               elseif(ptype==2) then
+                    write(fileunit,2001) sis.feat_data(ll,:nf_sis_avai(iFCDI))
+               endif
+           elseif(iFCDI>1) then
+               read(fileunit+1,'(a)') superline
+               if(ptype==1) then
+                  write(superline(20*(sum(nf_sis_avai(:iFCDI-1))+1)+1:),2001) sis.feat_data(ll,:nf_sis_avai(iFCDI))
+               elseif(ptype==2) then
+                  write(superline(20*(sum(nf_sis_avai(:iFCDI-1)))+1:),2001) sis.feat_data(ll,:nf_sis_avai(iFCDI))
+               endif
+               write(fileunit,'(a)') trim(superline)
+          end if
+        end do
+        close(fileunit)
+        if(iFCDI>1) close(fileunit+1,status='delete')
+     end do
+
+   write(*,'(3a,i10)') 'Size of the SIS-selected subspace from ',phiname,': ',nf_sis_avai(iFCDI)
+   write(9,'(3a,i10)') 'Size of the SIS-selected subspace from ',phiname,': ',nf_sis_avai(iFCDI)
+end if  
+
+! release all the spaces
+deallocate(trainy)
+deallocate(trainy_c)
+deallocate(sel.feat_data)
+deallocate(sel.feat_comp)
+deallocate(sel.feat_score)
+deallocate(sel.feat_name)
+deallocate(available)
+deallocate(order)
+
+if(mpirank==0) then
+  deallocate(sis.feat_data)
+  deallocate(sis.feat_score)
+  deallocate(sis.feat_name)
+end if
+if(nreject>0) deallocate(reject)
+
+call mpi_barrier(mpi_comm_world,mpierr)
+if(mpirank==0) then
+   mytime.eFC=mpi_wtime()
+   write(9,'(a,f15.2)') 'Time (second) used for this FC: ',mytime.eFC-mytime.sFC
+end if
+
+end subroutine
+
+
+subroutine combine(myinp,s1,e1,s2,e2,myops,nf,total_comb) 
+implicit none
+type(feature) myinp
+real progress
+real*8 data_tmp(ubound(myinp.feat_data,1)),unit_tmp(ubound(myinp.feat_unit,1))
+integer*8 i,j,k,kk,kkk,l,nf,comp_tmp,counter,total_comb,s1,e1,s2,e2
+character(len=*) myops
+character(len=str_len) name_tmp,lastop_tmp*10
+logical skip
+
+counter=0
+progress=0.2
+
+do i=s1,e1
+  
+! no operation
+      IF(trim(adjustl(myops))=='NO') THEN
+          lastop_tmp=''
+          comp_tmp=myinp.feat_comp(i)
+          name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')'
+          unit_tmp=myinp.feat_unit(:,i)
+          data_tmp=myinp.feat_data(:,i)
+          call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          cycle
+      END IF
+
+! unary operators
+      counter=counter+1
+      comp_tmp=myinp.feat_comp(i)+1
+      if(comp_tmp>fcomplexity) goto 599
+
+      ! exp
+      IF(index(myops,'(exp)')/=0 ) then
+      if(  index(myinp.lastop(i),'(exp')==0 .and. index(myinp.lastop(i),'(log)')==0 ) then ! avoid exp(exp( and exp(log(
+        lastop_tmp='(exp)'
+        name_tmp='exp('//trim(adjustl(myinp.feat_name(i)))//')'
+        unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(exp)')
+        data_tmp=exp(myinp.feat_data(:,i))
+        call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+      end if
+      END IF
+
+      ! exp-
+      IF(index(myops,'(exp-)')/=0 ) then
+      if( index(myinp.lastop(i),'(exp')==0  .and. index(myinp.lastop(i),'(log)')==0 ) then ! avoid exp(exp( and exp(log(
+        lastop_tmp='(exp-)'
+        name_tmp='exp(-'//trim(adjustl(myinp.feat_name(i)))//')'
+        unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(exp-)')
+        data_tmp=exp(-myinp.feat_data(:,i))
+        call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+      end if
+      END IF
+
+      ! ^-1
+      IF(index(myops,'(^-1)')/=0) then
+        if(minval(abs(myinp.feat_data(:,i)))>1d-50 ) then  ! avoid divided by zero
+           lastop_tmp='(^-1)'
+           name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^-1'
+           unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^-1)')
+           data_tmp=(myinp.feat_data(:,i))**(-1)
+           call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+        end if
+       END IF
+
+      ! scd: Standard Cauchy Distribution
+      IF(index(myops,'(scd)')/=0) then
+           lastop_tmp='(scd)'
+           name_tmp='scd('//trim(adjustl(myinp.feat_name(i)))//')'
+           unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(scd)')
+           data_tmp=1.0d0/(PI*(1.0d0+(myinp.feat_data(:,i))**2))
+           call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       END IF
+
+      ! ^2
+      IF(index(myops,'(^2)')/=0) then
+        if(index(myinp.lastop(i),'(sqrt)')==0 ) then ! avoid (sqrt())^2
+           lastop_tmp='(^2)'
+           name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^2'
+           unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^2)')
+           data_tmp=(myinp.feat_data(:,i))**2
+           call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+        end if
+       END IF
+
+      ! ^3
+      IF(index(myops,'(^3)')/=0) then
+        if(index(myinp.lastop(i),'(cbrt)')==0 ) then ! avoid (cbrt())^3
+         lastop_tmp='(^3)'
+         name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^3'
+         unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^3)')
+         data_tmp=(myinp.feat_data(:,i))**3
+         call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       end if
+     END IF
+
+      ! ^6
+      IF(index(myops,'(^6)')/=0) then
+         lastop_tmp='(^6)'
+         name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^6'
+         unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^6)')
+         data_tmp=(myinp.feat_data(:,i))**6
+         call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       END IF
+
+      ! sqrt
+      IF(index(myops,'(sqrt)')/=0) then
+        if(index(myinp.lastop(i),'(^2)')==0 ) then  ! avoid sqrt((^2))
+          if( minval(myinp.feat_data(:,i))>0  ) then
+              lastop_tmp='(sqrt)'     
+              name_tmp='sqrt('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(sqrt)')
+              data_tmp=sqrt(myinp.feat_data(:,i))
+              call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          end if
+         end if
+       END IF
+
+      ! cbrt: cube root
+      IF(index(myops,'(cbrt)')/=0) then
+        if(index(myinp.lastop(i),'(^3)')==0 ) then  ! avoid cbrt((^3))
+              lastop_tmp='(cbrt)'
+              name_tmp='cbrt('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(cbrt)')
+              data_tmp=(myinp.feat_data(:,i))**(1.d0/3.d0)
+              call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+         end if
+       END IF
+
+      ! log
+      IF(index(myops,'(log)')/=0) then
+        if( index(myinp.lastop(i),'(exp')==0 .and. index(myinp.lastop(i),'(log)')==0  ) then ! avoid log(exp( and log(log(
+          if( minval(myinp.feat_data(:,i))>0 ) then
+              lastop_tmp='(log)'
+              name_tmp='log('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(log)')
+              data_tmp=log(myinp.feat_data(:,i))
+              call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          end if
+         end if
+       END IF
+
+      ! sin
+      IF(index(myops,'(sin)')/=0) then
+         lastop_tmp='(sin)'
+         name_tmp='sin('//trim(adjustl(myinp.feat_name(i)))//')'
+         unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(sin)')
+         data_tmp=sin(myinp.feat_data(:,i))
+         call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+     END IF
+
+      ! cos
+      IF(index(myops,'(cos)')/=0) then
+         lastop_tmp='(cos)'
+         name_tmp='cos('//trim(adjustl(myinp.feat_name(i)))//')'
+         unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(cos)')
+         data_tmp=cos(myinp.feat_data(:,i))
+         call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+     END IF
+
+599  continue
+
+  ! binary operators
+  do j=s2,e2
+
+      if( j>=s1 .and. j<=i ) cycle
+
+      counter=counter+1
+      comp_tmp=myinp.feat_comp(i)+myinp.feat_comp(j)+1
+      if(comp_tmp>fcomplexity) goto 602
+
+      ! sum and subtract 
+       IF(index(myops,'(+)')/=0 .or. index(myops,'(-)')/=0  .or. index(myops,'(|-|)')/=0   ) THEN
+
+               ! different units
+               if ( maxval(abs(myinp.feat_unit(:,i)-myinp.feat_unit(:,j)))>1d-8 ) goto 600
+
+               !---
+               IF(index(myops,'(+)')/=0) then
+                   lastop_tmp='(+)'
+                   name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'+'//trim(adjustl(myinp.feat_name(j)))//')'
+                   unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(+)')  
+                   data_tmp=myinp.feat_data(:,i)+myinp.feat_data(:,j)
+                   call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                END IF
+                !---
+
+                IF(index(myops,'(-)')/=0) then  ! A-B
+                   lastop_tmp='(-)'
+                   name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'-'//trim(adjustl(myinp.feat_name(j)))//')'
+                   if(index(myops,'(+)')==0) unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(-)')   
+                   data_tmp=myinp.feat_data(:,i)-myinp.feat_data(:,j)
+                   call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+
+                  if(icomb<rung) then  ! B-A (no need if icomb=rung because of the coeff.)
+                    ! lastop_tmp='(-)'   ! same with above
+                     name_tmp='('//trim(adjustl(myinp.feat_name(j)))//'-'//trim(adjustl(myinp.feat_name(i)))//')'
+                    ! unit_tmp=dimcomb(inp.feat_unit(:,i),dim_in2(:,j),'(-)')   ! same with above
+                     data_tmp=-data_tmp
+                     call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                  end if
+                 END IF
+
+                !-----
+                IF(index(myops,'(|-|)')/=0) then  ! absolute difference
+                  lastop_tmp='(|-|)'
+                  name_tmp='abs('//trim(adjustl(myinp.feat_name(i)))//'-'//trim(adjustl(myinp.feat_name(j)))//')'
+                  if(index(myops,'(+)')==0) unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(|-|)')  
+                  if(index(myops,'(-)')/=0) then
+                       data_tmp=abs(data_tmp)
+                  else
+                       data_tmp=abs(myinp.feat_data(:,i)-myinp.feat_data(:,j))
+                  end if
+                  call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                 END IF
+                 !----
+         END IF
+
+600   continue
+
+      ! multiplication and division
+
+           !multiplication
+            IF(index(myops,'(*)')/=0) then
+                lastop_tmp='(*)'
+                name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'*'//trim(adjustl(myinp.feat_name(j)))//')'
+                unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(*)')
+                data_tmp=myinp.feat_data(:,i)*myinp.feat_data(:,j)
+                call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+            END IF
+
+          ! division
+           IF(index(myops,'(/)')/=0) then
+
+              ! avoid A/(B/C)=(A*C)/B and (A/B)/C=A/(B*C) which already exist via previous (*)
+              if(index(myops,'(*)')/=0 .and. &
+                (index(myinp.lastop(i),'(/)')/=0 .or. index(myinp.lastop(j),'(/)')/=0) .and. &
+                (icomb < rung)  )  goto 602   ! do nothing
+
+              skip=.false.
+              ! avoid A*B/B and A*B/A
+              if(index(myinp.lastop(i),'(*)')/=0) then
+                   k=index(myinp.feat_name(i),trim(adjustl(myinp.feat_name(j))))
+                 if(k>0) then
+                   kk=len_trim(myinp.feat_name(i)(k:))
+                   kkk=len_trim(adjustl(myinp.feat_name(j)))
+                  if( (trim(adjustl(myinp.feat_name(i)(:k-1)))=='(' &   ! j being the left part of i
+                      .and. myinp.feat_name(i)(k+kkk:k+kkk)=='*') .or. &
+                      (kk==kkk+1 .and. myinp.feat_name(i)(k-1:k-1)=='*')) then ! j being the right part of i
+                      if(index(myops,'(^-1)')/=0) then  ! i/j is no need. what about j/i?
+                          goto 602   ! no j/i
+                      else
+                          goto 601   ! do j/i
+                      end if
+                   end if
+                 end if
+              end if
+
+              ! avoid A/(A*B) and B/(A*B)
+             if(index(myinp.lastop(j),'(*)')/=0) then
+                 k=index(myinp.feat_name(j),trim(adjustl(myinp.feat_name(i))))
+                if(k>0) then
+                  kk=len_trim(myinp.feat_name(j)(k:))
+                  kkk=len_trim(adjustl(myinp.feat_name(i)))
+                  if((trim(adjustl(myinp.feat_name(j)(:k-1)))=='(' &
+                      .and. myinp.feat_name(j)(k+kkk:k+kkk)=='*') .or. &   ! i being the left part of j
+                     (kk==kkk+1 .and. myinp.feat_name(j)(k-1:k-1)=='*')) then ! i beingn the right part of j
+                      if(index(myops,'(^-1)')/=0) then  ! j/i is no need. what about i/j?
+                          goto 602  ! no i/j
+                      else
+                          skip=.true.  ! do i/j
+                      end if
+                  end if
+                end if
+             end if
+
+              !i/j
+              !---------
+              lastop_tmp='(/)'
+              name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'/'//trim(adjustl(myinp.feat_name(j)))//')'
+              unit_tmp=dimcomb(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(/)')
+              if(minval(abs(myinp.feat_data(:,j)))>1d-50 ) then 
+              data_tmp=myinp.feat_data(:,i)/myinp.feat_data(:,j)
+              call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+              end if
+              !------
+
+     601      continue
+              if(skip) goto 602
+
+            !j/i
+            !---------
+                 lastop_tmp='(/)'
+                 name_tmp='('//trim(adjustl(myinp.feat_name(j)))//'/'//trim(adjustl(myinp.feat_name(i)))//')'
+                 unit_tmp=dimcomb(myinp.feat_unit(:,j),myinp.feat_unit(:,i),'(/)')
+                 if(minval(abs(myinp.feat_data(:,i)))>1d-50) then
+                 data_tmp=myinp.feat_data(:,j)/myinp.feat_data(:,i)
+                 call isgoodf(data_tmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                 end if
+           END if
+
+   602   continue
+
+  end do
+
+  if(float(counter)/float(total_comb)>=progress) then
+     do while( (float(counter)/float(total_comb)) >= (progress +0.2) )
+           progress = progress+0.2
+     end do
+     write(*,'(a,i4,2(a,i15),a,f6.1,a)') &
+     'mpirank = ',mpirank,'  #generated =',nf,'  #selected =',sel.nselect,'  progress =',progress*100,'%'
+     progress=progress+0.2
+  end if
+
+end do
+
+end subroutine
+
+
+function goodf(feat_data,feat_name,feat_unit,feat_comp)
+integer*8 i,j,k,l,ll,mm1,mm2,feat_comp
+real*8 feat_data(:),feat_unit(:),scoretmp(2),maxabs
+character(len=*) feat_name
+logical goodf,lsame
+
+goodf=.true.
+
+! delete constant/zero/infinity features 
+!do ll=1,ntask
+!  mm1=sum(nsample(:ll-1))+1
+!  mm2=sum(nsample(:ll))
+   mm1=1
+   mm2=sum(nsample(:ntask))
+  if(maxval(abs(feat_data(mm1:mm2)-feat_data(mm1)))<=1d-8) then
+    goodf=.false. ! constant feature.
+    return
+  end if
+  maxabs=maxval(abs(feat_data(mm1:mm2)))
+  if(maxabs>1d50 .or. maxabs<=1d-50) then
+    goodf=.false. ! intinity or zero
+    return
+  end if
+!end do
+
+! not selected but can be used for further combination
+if(maxabs>fmax_max .or. maxabs<fmax_min) return
+
+! feature score 
+if(ptype==1) scoretmp=sis_score(feat_data,trainy_c)
+if(ptype==2) scoretmp=sis_score(feat_data,trainy) !classification
+
+if (scoretmp(1)<score_threshold) return
+
+! reject the previously selected features
+if(nreject>0) then
+  feat_name=adjustl(feat_name)
+  lsame=.false.
+  i=0; j=nreject; k=i+ceiling((j-i)/2.0)
+  do while(i/=j)
+     if(trim(feat_name)==trim(reject(k))) then
+       lsame=.true.
+       i=j
+     else if(feat_name<reject(k)) then
+       i=k;
+     else if(feat_name>reject(k)) then
+       j=k;
+     end if
+
+     if(k==i+ceiling((j-i)/2.0)) then
+         i=j
+     else
+         k=i+ceiling((j-i)/2.0)
+     end if
+  end do
+  if(lsame) return
+end if
+
+!--------------------------
+! selected
+!--------------------------
+sel.nselect=sel.nselect+1
+sel.feat_data(:,sel.nselect)=feat_data
+sel.feat_comp(sel.nselect)=feat_comp
+sel.feat_name(sel.nselect)=feat_name
+sel.feat_score(:,sel.nselect)=scoretmp
+if( sel.nselect== nbasic_select+nextra_select ) call update_select
+
+end function
+
+
+subroutine addm_gen(n)
+! increase array size
+real*8,allocatable:: real2d(:,:)
+integer*8,allocatable:: integer1d(:)
+character(len=str_len),allocatable:: char1d(:),char1d2(:)*10
+integer*8 i,j,k,n
+i=ubound(gen.feat_data,1)
+j=ubound(gen.feat_data,2)
+
+! gen.feat_data
+allocate(real2d(i,j))
+real2d=gen.feat_data
+deallocate(gen.feat_data)
+allocate(gen.feat_data(i,j+n))
+gen.feat_data(:,:j)=real2d
+deallocate(real2d)
+!-----
+!gen.feat_name
+allocate(char1d(j))
+char1d=gen.feat_name
+deallocate(gen.feat_name)
+allocate(gen.feat_name(j+n))
+gen.feat_name(:j)=char1d
+deallocate(char1d)
+!----
+!gen.lastop
+allocate(char1d2(j))
+char1d2=gen.lastop
+deallocate(gen.lastop)
+allocate(gen.lastop(j+n))
+gen.lastop(:j)=char1d2
+deallocate(char1d2)
+!----
+!gen.feat_comp
+allocate(integer1d(j))
+integer1d=gen.feat_comp
+deallocate(gen.feat_comp)
+allocate(gen.feat_comp(j+n))
+gen.feat_comp(:j)=integer1d
+deallocate(integer1d)
+!gen.feat_unit
+i=ubound(gen.feat_unit,1)
+allocate(real2d(i,j))
+real2d=gen.feat_unit
+deallocate(gen.feat_unit)
+allocate(gen.feat_unit(i,j+n))
+gen.feat_unit(:,:j)=real2d
+deallocate(real2d)
+!--
+end subroutine
+
+
+subroutine addm_inp(n,inp)
+! increase array size
+type(feature) inp
+real*8,allocatable:: real2d(:,:)
+integer*8,allocatable:: integer1d(:)
+character(len=str_len),allocatable:: char1d(:),char1d2(:)*10
+integer*8 i,j,k,n
+i=ubound(inp.feat_data,1)
+j=ubound(inp.feat_data,2)
+
+! inp.feat_data
+allocate(real2d(i,j))
+real2d=inp.feat_data
+deallocate(inp.feat_data)
+allocate(inp.feat_data(i,j+n))
+inp.feat_data(:,:j)=real2d
+deallocate(real2d)
+!-----
+!inp.feat_name
+allocate(char1d(j))
+char1d=inp.feat_name
+deallocate(inp.feat_name)
+allocate(inp.feat_name(j+n))
+inp.feat_name(:j)=char1d
+deallocate(char1d)
+!----
+!inp.lastop
+allocate(char1d2(j))
+char1d2=inp.lastop
+deallocate(inp.lastop)
+allocate(inp.lastop(j+n))
+inp.lastop(:j)=char1d2
+deallocate(char1d2)
+!---
+!inp.feat_comp
+allocate(integer1d(j))
+integer1d=inp.feat_comp
+deallocate(inp.feat_comp)
+allocate(inp.feat_comp(j+n))
+inp.feat_comp(:j)=integer1d
+deallocate(integer1d)
+!inp.feat_unit
+i=ubound(inp.feat_unit,1)
+allocate(real2d(i,j))
+real2d=inp.feat_unit
+deallocate(inp.feat_unit)
+allocate(inp.feat_unit(i,j+n))
+inp.feat_unit(:,:j)=real2d
+deallocate(real2d)
+!--
+end subroutine
+
+
+function dimcomb(dim1,dim2,op)
+! calculate the units for new features
+! unary operator: set dim1 and dim2 the same
+real*8 dim1(:),dim2(:),dimcomb(ubound(dim1,1))
+character(len=*) op
+integer i,j,k
+
+if(trim(adjustl(op))=='(+)' .or. trim(adjustl(op))=='(-)' .or. trim(adjustl(op))=='(|-|)'  ) then
+   do i=1,ubound(dim1,1)
+      if(abs(dim1(i)-dim2(i))>1d-8) stop 'Error: different units in linear combination'
+   end do
+   dimcomb=dim1
+else if(trim(adjustl(op))=='(*)') then
+   dimcomb=dim1+dim2
+else if(trim(adjustl(op))=='(/)') then
+   dimcomb=dim1-dim2
+else if(trim(adjustl(op))=='(exp)') then   
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(exp-)') then
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(log)') then
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(scd)') then 
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(sin)') then
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(cos)') then
+   dimcomb=0.d0
+else if(trim(adjustl(op))=='(^-1)') then
+   dimcomb=dim1*(-1)
+else if(trim(adjustl(op))=='(^2)') then
+   dimcomb=dim1*2
+else if(trim(adjustl(op))=='(^3)') then
+   dimcomb=dim1*3
+else if(trim(adjustl(op))=='(^6)') then
+   dimcomb=dim1*6
+else if(trim(adjustl(op))=='(sqrt)') then
+   dimcomb=dim1/2.d0
+else if(trim(adjustl(op))=='(cbrt)') then
+   dimcomb=dim1/3.d0
+end if
+end function
+
+
+subroutine writeout(phiname,i,k)
+integer*8 i,j,k
+character(len=*) phiname
+write(*,'(3a,i15)') 'Total number of features in the space ',trim(phiname),':',k
+write(9,'(3a,i15)') 'Total number of features in the space ',trim(phiname),':',k
+end subroutine
+
+
+function simpler(complexity1,complexity2,name1,name2)
+integer*8 simpler,complexity1,complexity2
+character(len=*) name1,name2
+
+if(complexity1<complexity2) then
+   simpler=1
+else if(complexity1>complexity2) then
+   simpler=2
+else if(complexity1==complexity2) then
+   if(len_trim(name1)<len_trim(name2) .or. &
+     (len_trim(name1)==len_trim(name2) .and. trim(name1)<=trim(name2) ) ) then
+      simpler=1
+   else
+      simpler=2
+   end if
+end if
+end function
+
+
+subroutine update_availability(available,pavailable)
+logical available(:),pavailable(:)
+if( any(available) ) then
+   pavailable(mpirank+1)=.true.
+else
+   pavailable(mpirank+1)=.false.
+end if
+end subroutine
+
+
+subroutine dup_pcheck(nfpcore_this,fID,fname,complexity,feat,order,available)
+! output array 'available'
+real*8 fID(:),feat(:,:)
+integer*8 i,j,k,l,ll,mpii,mpij,nfpcore_this(:),mpim(mpisize),mpiloc(mpisize),order(:),loc(1),&
+          complexity(:),simpler_result
+character(len=*) fname(:)
+logical available(:)
+real*8,allocatable:: compidentity(:),compfeat(:,:)
+character(len=str_len),allocatable:: compname(:)
+integer*8,allocatable:: compcomplexity(:)
+
+IF(mpisize>1) THEN
+   mpim=nfpcore_this  ! number of features in each core
+   do i=1,mpisize 
+      loc(1:1)=maxloc(mpim)
+      mpiloc(i)=loc(1)  ! sort the cores by the number of features from high to low
+      mpim(loc(1))=-1
+   end do
+
+   do k=1,mpisize-1
+
+     if(nfpcore_this(mpiloc(k))>0) then
+         allocate(compidentity(nfpcore_this(mpiloc(k))))
+         allocate(compname(nfpcore_this(mpiloc(k))))
+         allocate(compcomplexity(nfpcore_this(mpiloc(k))))
+         allocate(compfeat(npoint,nfpcore_this(mpiloc(k))))
+
+         if(mpirank==mpiloc(k)-1) then   ! every core knows the same mpiloc
+           compidentity=fID(:nfpcore_this(mpiloc(k)))
+           compname=fname(:nfpcore_this(mpiloc(k)))
+           compcomplexity=complexity(:nfpcore_this(mpiloc(k)))
+           compfeat=feat(:,:nfpcore_this(mpiloc(k)))
+         end if
+
+         ! broadcast compidentity from core mpiloc(k)-1 to all other cores
+          call mpi_bcast(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the feature expressions
+          call mpi_bcast(compname,nfpcore_this(mpiloc(k))*str_len,mpi_character,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the feature complexities
+          call mpi_bcast(compcomplexity,nfpcore_this(mpiloc(k)),mpi_integer8,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the features
+          call mpi_bcast(compfeat,npoint*nfpcore_this(mpiloc(k)),&
+                                                        mpi_double_precision,mpiloc(k)-1,mpi_comm_world,mpierr)
+
+         ! do the comparision. All the cores of mpiloc(k:mpisize) will be compared with
+         ! that on the core mpiloc(k)-1. 
+         if( all( mpirank/=(mpiloc(:k)-1) ) .and. nfpcore_this(mpirank+1)>0 ) then ! this core vs core mpiloc(k)-1
+            do mpij=1,nfpcore_this(mpiloc(k))  ! core mpiloc(k)-1
+              l=0; ll=order(nfpcore_this(mpirank+1)+1)  ! order stores features in descending order 
+              i=l+ceiling(float(ll-l)/2.0)   ! bisection method
+
+              124 continue
+              simpler_result=simpler(compcomplexity(mpij),complexity(order(i)),compname(mpij),fname(order(i)))
+
+                 ! check for duplication
+                 if(equivalent(compidentity(mpij),fID(order(i)),compfeat(:npoint,mpij),& 
+                               feat(:npoint,order(i))) ) then 
+                       if(simpler_result==1) then
+                          available(order(i))=.false.
+                       else
+                          compidentity(mpij)=0.d0
+                       end if
+                    cycle
+                 end if
+                 ! if not equal
+                 if(compidentity(mpij)>fID(order(i)) .or.  (compidentity(mpij)==fID(order(i)) .and. &
+                    simpler_result==1 ) ) then   ! mpij is better than i
+                    ll=i   ! replace the right end with i to find a new middle point
+                 else
+                     l=i  ! replace the left end with i to findn a new middle point
+                 end if
+                 if(i==l+ceiling(float(ll-l)/2.0)) cycle  ! i is already the left or right end; no more
+                 i=l+ceiling(float(ll-l)/2.0)   ! the new middle point
+                 goto 124
+            end do
+            call mpi_send(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpiloc(k)-1,33,&
+                          mpi_comm_world,status,mpierr)
+
+         else if(mpirank==mpiloc(k)-1) then
+             do l=1,mpisize
+                 if( any( l==mpiloc(:k) ) .or. nfpcore_this(l)==0 ) cycle
+            call mpi_recv(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpi_any_source,33,&
+                      mpi_comm_world,status,mpierr)
+               do i=1,nfpcore_this(mpiloc(k))
+                if(abs(compidentity(i))<1d-8) available(i)=.false.
+               end do
+            end do
+         end if
+
+         deallocate(compidentity)
+         deallocate(compname)
+         deallocate(compcomplexity)
+         deallocate(compfeat)
+     end if
+
+  end do
+
+  j=0
+  do i=1,nfpcore_this(mpirank+1)
+   if(available(i)) j=j+1
+  end do
+
+  if(mpirank/=0)  then
+    call mpi_send(j,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+  else
+    do k=1,mpisize-1
+      call mpi_recv(i,1,mpi_integer8,k,1,mpi_comm_world,status,mpierr)
+      j=j+i
+    end do
+  end if
+
+END IF
+
+end subroutine
+
+
+subroutine sure_indep_screening(nfpcore_this,available)
+! sure independence screening
+real*8 tmp,pscore(2,mpisize)
+integer*8 i,j,k,l,ll,mpii,mpij,nfpcore_this(:),loc(2),simpler_result,pcomplexity(mpisize)
+character(len=str_len) pname(mpisize)
+logical available(:),pavailable(mpisize)
+
+
+if(mpirank==0) write(*,'(/a)') 'Sure Independence Screening on selected features ...'
+
+pavailable=.false.
+if(nfpcore_this(mpirank+1)>0) call update_availability(available,pavailable)
+do l=1,mpisize
+ call mpi_bcast(pavailable(l),1,mpi_logical,l-1,mpi_comm_world,mpierr) 
+end do
+
+! selection starts ...
+i=0   ! count of selected features
+do while( any(pavailable) .and. i<nf_sis(iFCDI) )
+
+      i=i+1
+
+      ! find the max score on each core
+      if(nfpcore_this(mpirank+1)>0) then
+        loc(2:2)=maxloc(sel.feat_score(1,:sel.nselect)) ! score_2 is less important or not used
+        tmp=sel.feat_score(1,loc(2))  ! (loc(1) to be used for other purpose
+      end if
+      do l=1,nfpcore_this(mpirank+1)  ! equal scores
+        if(  abs(tmp-sel.feat_score(1,l))<=1d-8 ) then
+           if( sel.feat_score(2,l)-sel.feat_score(2,loc(2))>1d-8 .or. &
+             (abs(sel.feat_score(2,l)-sel.feat_score(2,loc(2)))<=1d-8 .and. &
+             simpler(sel.feat_comp(l),sel.feat_comp(loc(2)),sel.feat_name(l),sel.feat_name(loc(2)))==1) ) loc(2)=l
+        end if
+      end do
+      if(nfpcore_this(mpirank+1)>0) then
+        pscore(:,mpirank+1)=sel.feat_score(:,loc(2))  ! location of max score of this core
+        pcomplexity(mpirank+1)=sel.feat_comp(loc(2))  ! corresponding feature complexity
+        pname(mpirank+1)=sel.feat_name(loc(2))  ! corresponding feature name
+      else
+        pscore(:,mpirank+1)=-1   ! simply a negative value to indicate empty
+      end if
+
+      do l=1,mpisize  ! inform other cores about local best features
+        call mpi_bcast(pscore(:,l),2,mpi_double_precision,l-1,mpi_comm_world,mpierr)
+        call mpi_bcast(pcomplexity(l),1,mpi_integer8,l-1,mpi_comm_world,mpierr)
+        call mpi_bcast(pname(l),str_len,mpi_character,l-1,mpi_comm_world,mpierr)
+      end do
+
+      !find the max score between cores
+      loc(1:1)=maxloc(pscore(1,:))
+      tmp=maxval(pscore(1,:))
+      do l=1,mpisize  ! if equal score between features
+        if(  abs(tmp-pscore(1,l))<=1d-8 ) then
+           if( pscore(2,l)-pscore(2,loc(1))>1d-8 .or.  ( abs(pscore(2,l)-pscore(2,loc(1)))<=1d-8 .and. &
+               simpler(pcomplexity(l),pcomplexity(loc(1)),pname(l),pname(loc(1)))==1 ) ) loc(1)=l
+        end if
+      end do
+
+      ! save the highest-scored feature
+      if((loc(1)-1)==mpirank) then
+         if(mpirank==0) then
+             sis.feat_data(:,i)=sel.feat_data(:,loc(2))
+             sis.feat_name(i)=sel.feat_name(loc(2))
+             sis.feat_score(:,i)=sel.feat_score(:,loc(2))
+         else
+            call mpi_send(sel.feat_data(:,loc(2)),npoint,mpi_double_precision,0,100,mpi_comm_world,status,mpierr)
+            call mpi_send(sel.feat_name(loc(2)),str_len,mpi_character,0,101,mpi_comm_world,status,mpierr)
+            call mpi_send(sel.feat_score(:,loc(2)),2,mpi_double_precision,0,103,mpi_comm_world,status,mpierr)
+         end if
+         available(loc(2))=.false.   ! avoid to be selected again
+         sel.feat_score(:,loc(2))=-1  
+      end if
+
+      if(mpirank==0 .and. mpirank/=(loc(1)-1) ) then
+           call mpi_recv(sis.feat_data(:,i),npoint,mpi_double_precision,loc(1)-1,100,mpi_comm_world,status,mpierr)
+           call mpi_recv(sis.feat_name(i),str_len,mpi_character,loc(1)-1,101,mpi_comm_world,status,mpierr)
+           call mpi_recv(sis.feat_score(:,i),2,mpi_double_precision,loc(1)-1,103,mpi_comm_world,status,mpierr)
+      end if
+      !---
+
+      if(nfpcore_this(mpirank+1)>0) call update_availability(available,pavailable)
+      do l=1,mpisize
+       call mpi_bcast(pavailable(l),1,mpi_logical,l-1,mpi_comm_world,mpierr)
+      end do
+
+end do
+!--------------------
+
+
+nf_sis_avai(iFCDI)=i    ! the actual number of selected features
+
+end subroutine
+
+
+subroutine dup_scheck(num,fID,fname,complexity,feat,order,available)
+! duplication check within each core
+! output the arrays "order" and "available"
+integer*8 i,j,l,ll,order(:),n,num,complexity(:),simpler_result
+real*8 fID(:),feat(:,:)
+character(len=*) fname(:)
+logical available(:)
+
+IF(num==0) THEN
+  n=0
+ELSE
+
+! creating the descending ordering via bisection method 
+order(1)=1  ! initial first entry
+n=1
+do i=2,num
+
+   l=0; ll=n;   ! left and right end
+   j=l+ceiling(float(ll-l)/2.0)  ! the middle one
+
+   123 continue
+   simpler_result=simpler(complexity(i),complexity(order(j)),fname(i),fname(order(j)))
+
+   ! check for duplication
+      if(equivalent(fID(i),fID(order(j)),feat(:npoint,i),feat(:npoint,order(j))) ) then
+           if( simpler_result==1 ) then
+              available(order(j))=.false.
+              order(j)=i
+           else
+              available(i)=.false.
+           end if
+           cycle
+      end if
+     ! if not equal
+     if(fID(i)>fID(order(j)) .or. (fID(i)==fID(order(j)) .and. simpler_result==1 ) ) then  ! i is better
+        ll=j    ! replace the ll with the j to find a new middle point
+        if(j==l+ceiling(float(ll-l)/2.0)) then   ! if j is already the left end
+          order(j+1:n+1)=order(j:n)  
+          order(j)=i   ! insert i to that before the original j
+          n=n+1
+          cycle
+        end if
+      else
+         l=j   ! j is better; replace the left end with j to find a new middle point
+         if(j==l+ceiling(float(ll-l)/2.0)) then  ! if j is already the right end
+           if(n>j) order(j+2:n+1)=order(j+1:n)
+           order(j+1)=i   ! insert i to that after j
+           n=n+1
+           cycle
+         end if
+      end if
+
+      j=l+ceiling(float(ll-l)/2.0)  ! the new middle point
+      goto 123
+end do
+
+END IF
+
+order(num+1)=n  ! store the number of features after check
+if(mpirank/=0)  then
+  call mpi_send(n,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+else
+  do l=1,mpisize-1
+    call mpi_recv(i,1,mpi_integer8,l,1,mpi_comm_world,status,mpierr)
+    n=n+i
+ end do
+end if
+end subroutine
+
+
+function sis_score(feat,yyy)   
+! correlation between a feature 'feat' and the target 'yyy'
+! sis_score returns a vector with 2 elements
+integer i,j,mm1,mm2,mm3,mm4,k,kk,l,overlap_n,nf1,nf2,itask,nconvexpair
+real*8 feat(:),sdfeat(ubound(feat,1)),tmp(ntask),sis_score(2),yyy(:),xnorm(ntask),xmean(ntask),&
+       overlap_length,length_tmp,feat_tmp1(ubound(feat,1)),feat_tmp2(ubound(feat,1)),mindist,minlen
+logical isoverlap
+
+
+if(ptype==1) then  
+     do j=1,ntask
+         mm1=sum(nsample(:j-1))+1
+         mm2=sum(nsample(:j))
+         xmean(j)=sum(feat(mm1:mm2))/nsample(j)
+         sdfeat(mm1:mm2)=feat(mm1:mm2)-xmean(j)  
+         xnorm(j)=sqrt(sum((sdfeat(mm1:mm2))**2))
+         if(xnorm(j)>1d-50) then
+            sdfeat(mm1:mm2)=sdfeat(mm1:mm2)/xnorm(j)  ! standardization to the features
+            tmp(j)=abs(sum(sdfeat(mm1:mm2)*yyy(mm1:mm2))) ! |xy|. Donot use normalized yyy here!
+         else
+            tmp(j)=0.d0
+         end if
+     end do
+   ! score ranges from 0 to 1
+   sis_score(1)=sqrt(sum(tmp**2)/ntask)  ! quadratic mean of the scores of different tasks
+   sis_score(1)=sis_score(1)/sqrt(sum(yyy**2)/ntask)  ! normalization
+   sis_score(2)=1.d0   ! not used
+
+else if(ptype==2) then  
+   mindist=-1d10
+   overlap_n=0
+   overlap_length=0.d0
+   isoverlap=.false.
+   nconvexpair=0
+   do itask=1,ntask
+
+   ! calculate overlap between domains of property i
+        do i=1,ngroup(itask,1000)-1   ! ngroup(itask,1000) store the number of groups in this task
+            if(itask==1) then
+              mm1=sum(ngroup(itask,:i-1))+1
+              mm2=sum(ngroup(itask,:i))
+            else            
+              mm1=sum(nsample(:itask-1))+sum(ngroup(itask,:i-1))+1
+              mm2=sum(nsample(:itask-1))+sum(ngroup(itask,:i))
+            end if
+            nf1=0
+            do k=mm1,mm2
+               if(yyy(k)<0.5) then  ! y=1: classified, y=0: unclassified
+                   nf1=nf1+1
+                   feat_tmp1(nf1)=feat(k) 
+               end if
+            end do       
+          do j=i+1,ngroup(itask,1000) 
+               if(itask==1) then
+                 mm3=sum(ngroup(itask,:j-1))+1
+                 mm4=sum(ngroup(itask,:j))
+               else
+                 mm3=sum(nsample(:itask-1))+sum(ngroup(itask,:j-1))+1
+                 mm4=sum(nsample(:itask-1))+sum(ngroup(itask,:j))
+               end if
+               nf2=0
+               do k=mm3,mm4
+                   if(yyy(k)<0.5) then
+                       nf2=nf2+1
+                       feat_tmp2(nf2)=feat(k) 
+                   end if
+               end do
+               if(isconvex(itask,i)==0 .and. isconvex(itask,j)==0) cycle ! both are not convex domains
+
+               if(isconvex(itask,i)==1 .and. isconvex(itask,j)==1) then
+                  call convex1d_overlap(feat_tmp1(:nf1),feat_tmp2(:nf2),bwidth,k,length_tmp)
+                  overlap_n=overlap_n+k
+                  nconvexpair=nconvexpair+1
+                  if(length_tmp>=0.d0) isoverlap=.true.
+                   ! which feature is shorter
+                   minlen=min(maxval(feat_tmp1(:nf1))-minval(feat_tmp1(:nf1)),maxval(feat_tmp2(:nf2))&
+                          -minval(feat_tmp2(:nf2)))
+                   if(length_tmp<0.d0) then  ! if separated
+                      if(length_tmp>mindist) mindist=length_tmp  ! renew the worst separation
+                   else if(length_tmp>=0.d0 .and. minlen==0.d0) then  ! if overlapped and one feature is 0D
+                      overlap_length=overlap_length+1.d0  ! totally overlapped
+                   else if(length_tmp>=0.d0 .and. minlen>0.d0) then  ! if not separated and no 0D feature
+                      overlap_length=overlap_length+length_tmp/minlen  ! calculate total overlap
+                   end if
+               else if (isconvex(itask,i)==0 .and. isconvex(itask,j)==1) then  !count the number of i-data in j-domain
+                  overlap_n=overlap_n+convex1d_in(feat(mm3:mm4),feat_tmp1(:nf1),bwidth)
+               else if (isconvex(itask,i)==1 .and. isconvex(itask,j)==0) then !count the number of j-data in i-domain
+                  overlap_n=overlap_n+convex1d_in(feat(mm1:mm2),feat_tmp2(:nf2),bwidth)
+               end if
+
+          end do  ! j
+        end do  ! i
+   end do  ! itask
+
+   sis_score(1)=float(overlap_n)  ! >=0, higher sis_score --> worse feature
+   sis_score(1)=1.d0/(sis_score(1)+1.d0)  ! transform to <=1, higher sis_score --> better feature
+   if(isoverlap) then  ! there are domains overlapped
+     sis_score(2)=overlap_length/float(nconvexpair)     ! second metric, <=1, higher --> worse
+     sis_score(2)=1.d0/(sis_score(2)+1.0)  ! transform to <=1, higher --> better
+   else ! separated length
+     sis_score(2)=-mindist   ! separated length between domains,positive
+   end if
+
+end if
+end function
+
+
+subroutine isgoodf(feat_data,feat_name,lastop,feat_comp,feat_unit,nf)
+real*8 feat_data(:),feat_unit(:)
+character(len=*) feat_name,lastop
+integer*8 nf,feat_comp
+
+if(goodf(feat_data,feat_name,feat_unit,feat_comp)) then
+  nf=nf+1
+  if(icomb < rung) then
+     if(nf>ubound(gen.feat_data,2)) call addm_gen(int8(ceiling(100000.0/mpisize)))
+     gen.feat_data(:,nf)=feat_data
+     gen.feat_name(nf)=feat_name
+     gen.lastop(nf)=lastop
+     gen.feat_comp(nf)=feat_comp
+     gen.feat_unit(:,nf)=feat_unit
+  end if
+end if
+end subroutine
+
+
+subroutine update_select
+! update the selected space
+! bisection method for descending order
+integer*8 i,j,k,l,ll,order(sel.nselect),n,tmp,tmpcomplexity(nbasic_select),simpler_result
+real*8 tmpf(ubound(sel.feat_data,1),nbasic_select),tmpscore(2,nbasic_select)
+character(len=str_len) tmpname(nbasic_select)
+
+! creating an ordering from large to small
+order(1)=1   ! assuming the first feature being the best (highest score)
+n=1  ! count of features saved in 'order'
+
+do i=2,sel.nselect   ! compare feaure i with the j located at middle of order(1:n)
+
+  l=0; ll=n;
+  j=l+ceiling(float(ll-l)/2.0)   ! j is at middle between l and ll.
+ 
+  125 continue
+  simpler_result=simpler(sel.feat_comp(i),sel.feat_comp(order(j)),sel.feat_name(i),sel.feat_name(order(j)))
+
+  ! get rid of duplicated features
+  if(equivalent(sel.feat_score(1,i),sel.feat_score(1,order(j)),sel.feat_data(:npoint,i),&
+                sel.feat_data(:npoint,order(j))) ) then
+        if( simpler_result==1) order(j)=i  ! update the order
+        cycle
+  end if
+
+  ! bisection method for descending order
+  if( (sel.feat_score(1,i)>sel.feat_score(1,order(j))) .or.  ((sel.feat_score(1,i)==sel.feat_score(1,order(j))) &
+       .and. sel.feat_score(2,i)>sel.feat_score(2,order(j)) ) .or.  ((sel.feat_score(1,i)==sel.feat_score(1,order(j))) &
+       .and. sel.feat_score(2,i)==sel.feat_score(2,order(j)) .and. simpler_result==1) )then 
+      ll=j   ! i is better. Replace the right end ll with j, and find a new middle point
+      if(j==l+ceiling(float(ll-l)/2.0)) then  ! if j is already the left end
+        order(j+1:n+1)=order(j:n)   ! move all the original j:n features by 1 step
+        order(j)=i ! insert the i right before the j, because score_i > score_j
+        n=n+1  ! size increased by 1
+        cycle
+      end if
+   else   ! j is better. Replace the left end l with j, and find a new middle point
+       l=j
+       if(j==l+ceiling(float(ll-l)/2.0)) then
+         if(n>j) order(j+2:n+1)=order(j+1:n)
+         order(j+1)=i
+         n=n+1
+         cycle
+       end if
+   end if
+
+   j=l+ceiling(float(ll-l)/2.0)  ! the new middle point
+   goto 125
+end do
+
+sel.nselect=min(n,nbasic_select)   ! reduce the space by removing features with low ranking
+
+! reordering
+do i=1,sel.nselect
+tmpf(:,i)=sel.feat_data(:,order(i))
+tmpcomplexity(i)=sel.feat_comp(order(i))
+tmpname(i)=sel.feat_name(order(i))
+tmpscore(:,i)=sel.feat_score(:,order(i))
+end do
+
+! update the selected features
+score_threshold=tmpscore(1,sel.nselect)
+sel.feat_data(:,:sel.nselect)=tmpf(:,:sel.nselect)
+sel.feat_comp(:sel.nselect)=tmpcomplexity(:sel.nselect)
+sel.feat_name(:sel.nselect)=tmpname(:sel.nselect)
+sel.feat_score(:,:sel.nselect)=tmpscore(:,:sel.nselect)
+
+end subroutine
+
+function ffcorr(feat1,feat2)
+! calculate the correlation coefficient between two features
+! -1=< ffcorr <= 1
+real*8 ffcorr,feat1(:),feat2(:),mean1,mean2,sd1,sd2
+
+mean1=sum(feat1)/float(npoint)
+mean2=sum(feat2)/float(npoint)
+sd1=sqrt((sum((feat1-mean1)**2))/float(npoint))
+sd2=sqrt((sum((feat2-mean2)**2))/float(npoint))
+ffcorr=(sum((feat1-mean1)*(feat2-mean2)))/float(npoint)/sd1/sd2
+if(ffcorr>1.d0) ffcorr=1.d0    ! avoid numerical noise
+if(ffcorr<-1.d0) ffcorr=-1.d0  
+end function
+
+
+function equivalent(score1,score2,feat1,feat2)
+! check if two features are the same or highly correlated.
+real*8 score1,score2,diff1,diff2,feat1(:),feat2(:)
+logical equivalent
+
+equivalent=.false.
+diff1=score1-score2
+if(abs(diff1)>1d-8) return
+
+if( abs(ffcorr(feat1,feat2))> 0.99 ) equivalent=.true.
+
+end function
+
+
+end module
+
diff --git a/src/FCse.f90 b/src/FCse.f90
new file mode 100644
index 0000000..70a58f3
--- /dev/null
+++ b/src/FCse.f90
@@ -0,0 +1,1894 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+module FCse
+! feature construction
+!----------------------------------------------------------------------
+
+use var_global
+use libsisso
+implicit none
+! variables used by subroutines and functions in this module
+
+type Sexpression
+  integer list_id(Smaxlen),list_len,list_pointer(2,Smaxlen)  ! A list for each S-expression
+  character list_var(Smaxlen)*30,list_op(Smaxlen)*10
+end type
+
+type feature
+  type(Sexpression),allocatable:: Sexpr(:)
+  character(len=str_len),allocatable:: feat_name(:),lastop(:)*10
+  integer*8,allocatable:: feat_comp(:)
+  real*8,allocatable:: feat_unit(:,:)
+end type
+
+type feature_selected
+  integer*8 nselect
+  type(Sexpression),allocatable:: Sexpr(:)
+  real*8,allocatable:: feat_score(:,:)
+  character(len=str_len),allocatable:: feat_name(:)
+  integer*8,allocatable:: feat_comp(:)
+end type
+
+type feature_sis
+  type(Sexpression),allocatable:: Sexpr(:)
+  real*8,allocatable:: feat_score(:,:)
+  character(len=str_len),allocatable:: feat_name(:)
+end type
+
+type(feature) gen
+type(feature_selected) sel
+type (feature_sis) sis
+
+integer*8 ntot,nthis,nreject,nbasic_select,nextra_select,icomb
+real*8 score_threshold
+real*8,allocatable:: trainy(:),trainy_c(:)
+character(len=str_len),allocatable::  reject(:)
+
+contains
+
+
+subroutine feature_construction_se
+implicit none
+integer   loc(1),ioerr
+character line*500,phiname*5,reject_file_name*100,superline*(20*(1+sum(nf_sis(:desc_dim))))
+real*8    bbb,ccc,aaa(mpisize),tag(npoint)
+integer*8 i,j,k,l,ll,mm1,mm2,nf(20),mpii,mpij,mpik,total_comb,nfpcore_this(mpisize),mpin(mpisize),mpin2(mpisize)
+real*8,allocatable:: fID(:),tmp_data(:,:)
+integer*8,allocatable:: order(:)
+character mpisync*1
+logical,allocatable:: available(:)
+type(feature) inp
+
+mpisync='Y'
+
+! Stop if the whole feature space had been selected.
+IF (iFCDI>1 .and. nf_sis_avai(iFCDI-1)<nf_sis(iFCDI-1)) THEN
+   if(mpirank==0) then
+      write(*,'(a)') 'The whole feature-space has been selected! No more FC will be performed.' 
+      write(9,'(a)') 'The whole feature-space has been selected! No more FC will be performed.'
+   end if
+   return
+END IF
+
+! running time by FC
+if(mpirank==0) mytime.sFC=mpi_wtime()
+
+!------------------------------------------------------
+allocate(trainy(npoint))
+allocate(trainy_c(npoint))
+i=max(1000,nsf)
+allocate(inp.Sexpr(i))   
+allocate(inp.feat_name(i))
+allocate(inp.lastop(i))
+allocate(inp.feat_unit(nunit,i))
+allocate(inp.feat_comp(i))
+
+allocate(gen.Sexpr(i))
+allocate(gen.feat_name(i))
+allocate(gen.lastop(i))
+allocate(gen.feat_comp(i))
+allocate(gen.feat_unit(nunit,i))
+
+nbasic_select=nf_sis(iFCDI)  ! parallel
+if(mpisize==1) nbasic_select=2*nf_sis(iFCDI)  ! serial
+nextra_select=min(nf_sis(iFCDI),50000)
+j=nbasic_select+nextra_select
+
+allocate(sel.Sexpr(j))
+allocate(sel.feat_score(2,j))
+allocate(sel.feat_name(j))
+allocate(sel.feat_comp(j))
+
+! initialization
+!------------------
+trainy=res
+score_threshold=-1.0
+inp.feat_unit(:,:nsf)=feature_units
+inp.feat_name(:nsf)=pfname(:nsf)
+inp.lastop=''     ! the last operation of the generated feature
+inp.feat_comp=0
+
+do i=1,nsf    
+  inp.Sexpr(i).list_id(1)=1   ! first entry of the list of expression i
+  inp.Sexpr(i).list_len=1   ! current list length
+  inp.Sexpr(i).list_op(1)='var'  ! is variable, no operation
+  inp.Sexpr(i).list_pointer(:,1)=0   ! the 2 operands specified by the pointer. No operand for variable
+  inp.Sexpr(i).list_var(1)=trim(adjustl(pfname(i)))  ! the variable name
+end do
+
+
+! The 'tag' to be used by 'fID' during redundant check
+do j=1,ntask
+do i=1,nsample(j)    
+  tag(sum(nsample(:j-1))+i)=1.0d0+0.001d0*i
+end do
+end do
+
+! avoid duplication in the selected space
+if(iFCDI>1) reject_file_name='SIS_subspaces/Uspace.expressions'
+
+nreject=0
+if(iFCDI >1 .and. mpirank==0) then
+   open(fileunit,file=trim(adjustl(reject_file_name)),status='old')
+
+   ! find the number of expressions in the file
+   do while(.true.)
+      read(fileunit,*,iostat=ioerr)
+      if(ioerr<0) exit
+      nreject=nreject+1
+   end do
+
+   if(nreject>0) allocate(reject(nreject))
+   rewind(fileunit)
+   
+   do j=1,nreject
+      read(fileunit,'(a)') line
+      call string_split(line,reject(j:j),' ')
+      reject(j)=adjustl(reject(j))
+   end do
+   close(fileunit)
+   !ordering
+   do i=1,nreject-1
+       loc(1)=i
+       do j=i+1,nreject
+         if(reject(j)>reject(loc(1))) loc(1)=j
+       end do
+       if(loc(1)>i) then
+         line=trim(reject(i))
+         reject(i)=reject(loc(1))
+         reject(loc(1))=trim(line)
+       end if
+   end do
+end if
+
+call mpi_bcast(nreject,1,mpi_integer8,0,mpi_comm_world,mpierr)
+if(mpirank/=0 .and. nreject>0) allocate(reject(nreject))
+if(nreject>0) call mpi_bcast(reject,nreject*str_len,mpi_character,0,mpi_comm_world,mpierr)
+!---------------------------------
+! Population Standard Deviation
+!---------------------------------
+
+if(ptype==1) then
+   do j=1,ntask
+    mm1=sum(nsample(:j-1))+1
+    mm2=sum(nsample(:j))
+    trainy_c(mm1:mm2)= trainy(mm1:mm2)-sum(trainy(mm1:mm2))/(mm2-mm1+1)   ! centered
+   end do
+end if
+
+!---------------------------------------------------
+! feature construction start ...
+! Phi0 is the initial space with primary features
+!---------------------------------------------------
+sel.nselect=0 ! number of selected features
+nf=0      ! number of generated features from each combination
+i=nsf ! total number of primary features
+j=0
+
+! no combination, just primary features
+total_comb=0
+call combine_se(inp,1,i,0,0,'NO',j,total_comb) 
+! ntot is forced to equal total number of pf as all are already stored in inp.feat, yet sel.nselect is not
+! necessarily to be ntot if some pf is not good (e.g. constant).
+! The purpose here is store good primary features in sel.feat
+
+ntot=nsf    ! total number of features
+nthis=ntot  ! number of features generated from this combine_se()
+write(phiname,'(a,i2.2)') 'Phi',0
+if(mpirank==0)  call writeout_se(phiname,sel.nselect,ntot)  
+
+if(rung==0) then
+  deallocate(gen.Sexpr)
+  deallocate(gen.feat_name)
+  deallocate(gen.lastop)
+  deallocate(gen.feat_comp)
+  deallocate(gen.feat_unit)
+end if
+
+!-------------------------
+!creating PHI1, PHI2, ...
+!-------------------------
+do icomb=1,rung
+    if(mpirank==0) write(*,'(/a,i2.2,a)') 'Generating Phi',icomb,' ...'
+
+    call mpi_barrier(mpi_comm_world,mpierr)
+
+    if(icomb==rung) then
+       deallocate(gen.Sexpr)
+       deallocate(gen.feat_name)
+       deallocate(gen.lastop)
+       deallocate(gen.feat_comp)
+       deallocate(gen.feat_unit)
+    end if
+
+    ! allocating equivalent workfload for each core
+    ! Total jobs: T=N*M + N(N-1)/2, where N=nthis, M = ntot-nthis
+    ! Divide N into n parts: X1, X2, ..., Xi, ..., where n=mpisize
+    ! X1=X1(X1-1)/2+X1(N-X1+M) = T/n --> X1
+    ! X2=X2(X2-1)/2+X2(N-X1-X2+M) = T/n --> X2
+    ! Xi=Xi(Xi-1)/2+Xi(N-X1-X2-...-Xi+M) = T/n -->Xi
+    ! Let bbb=N+M-1/2-sum(X1+X2+...+X{i-1}),ccc=-(N*M+N(N-1)/2)/n
+    ! Xi= bbb-sqrt(bbb^2+2ccc)
+    ! mpin: store the Xi, mpin2: starting position of the Xi part in nthis
+
+      mpin=0
+      mpin2=0
+      ccc=-(nthis*(ntot-nthis)+nthis*(nthis-1)/2.d0)/float(mpisize)
+      do mpii=1,mpisize
+         bbb=ntot-0.5-sum(aaa(:mpii-1))
+         if((bbb**2+2*ccc)>0) then
+            aaa(mpii)=bbb-sqrt(bbb**2+2*ccc)
+            mpin(mpii)=ceiling(aaa(mpii))
+            if (sum(mpin(:mpii))>nthis) then
+                  mpin(mpii)=nthis-sum(mpin(:mpii-1))
+                  exit
+            end if
+         end if
+      end do
+
+      if(sum(mpin)<nthis) then  ! it can happen when ntot==nthis
+         mpin(mpisize)=mpin(mpisize)+nthis-sum(mpin)
+      end if
+
+      do i=1,mpisize
+        if(mpin(i)>0) mpin2(i)=sum(mpin(1:i-1))+1
+      end do
+
+     ! check if array size need to be increased
+      i=ntot-nthis+(nthis-mpin2(mpirank+1)+1)   ! total number of features to be stored in this core
+      if(ubound(inp.Sexpr,1)< i)  call addm_inp_se(i-ubound(inp.Sexpr,1),inp)
+
+     ! broadcast ntot-nthis
+       if(ntot>nthis) then
+        i=ntot-nthis
+        do ll=1,i
+          call mpi_bcast(inp.Sexpr(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,0,mpi_comm_world,mpierr)
+          call mpi_bcast(inp.Sexpr(ll).list_len,1,mpi_integer,0,mpi_comm_world,mpierr)
+          call mpi_bcast(inp.Sexpr(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,0,mpi_comm_world,mpierr)
+          call mpi_bcast(inp.Sexpr(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,0,mpi_comm_world,mpierr)
+          call mpi_bcast(inp.Sexpr(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,0,mpi_comm_world,mpierr)
+        end do
+        call mpi_bcast(inp.feat_name(:i),i*str_len,mpi_character,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.lastop(:i),i*10,mpi_character,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.feat_unit(:,:i),i*nunit,mpi_double_precision,0,mpi_comm_world,mpierr)
+        call mpi_bcast(inp.feat_comp(:i),i,mpi_integer8,0,mpi_comm_world,mpierr)
+       end if
+
+      if(mpirank==0) then  ! send nthis 
+        do i=2,mpisize
+          j=ntot-nthis+mpin2(i)  ! starting position
+          k=nthis-mpin2(i)+1   ! size
+          if(mpin(i)>0) then
+          do ll=j,ntot
+           call mpi_send(inp.Sexpr(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,i-1,5,mpi_comm_world,status,mpierr)
+           call mpi_send(inp.Sexpr(ll).list_len,1,mpi_integer,i-1,6,mpi_comm_world,status,mpierr)
+           call mpi_send(inp.Sexpr(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,i-1,7,mpi_comm_world,status,mpierr)
+           call mpi_send(inp.Sexpr(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,i-1,8,mpi_comm_world,status,mpierr)
+           call mpi_send(inp.Sexpr(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,i-1,9,mpi_comm_world,status,mpierr)
+           call mpi_recv(mpisync,1,mpi_character,i-1,10,mpi_comm_world,status,mpierr)
+          end do
+          call mpi_send(inp.feat_name(j:ntot),k*str_len,mpi_character,i-1,1,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.lastop(j:ntot),k*10,mpi_character,i-1,2,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.feat_comp(j:ntot),k,mpi_integer8,i-1,3,mpi_comm_world,status,mpierr)
+          call mpi_send(inp.feat_unit(:,j:ntot),k*nunit,mpi_double_precision,i-1,4,mpi_comm_world,status,mpierr)
+          end if
+        end do
+      else  ! receive
+        j=ntot-nthis+1  ! start
+        l=ntot-nthis+(nthis-mpin2(mpirank+1)+1)  ! end
+        k=nthis-mpin2(mpirank+1)+1  ! size
+        if(mpin(mpirank+1)>0) then
+          do ll=j,l
+           call mpi_recv(inp.Sexpr(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,0,5,mpi_comm_world,status,mpierr)
+           call mpi_recv(inp.Sexpr(ll).list_len,1,mpi_integer,0,6,mpi_comm_world,status,mpierr)
+           call mpi_recv(inp.Sexpr(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,0,7,mpi_comm_world,status,mpierr)
+           call mpi_recv(inp.Sexpr(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,0,8,mpi_comm_world,status,mpierr)
+           call mpi_recv(inp.Sexpr(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,0,9,mpi_comm_world,status,mpierr)
+           call mpi_send(mpisync,1,mpi_character,0,10,mpi_comm_world,status,mpierr)
+          end do
+          call mpi_recv(inp.feat_name(j:l),k*str_len,mpi_character,0,1,mpi_comm_world,status,mpierr)
+          call mpi_recv(inp.lastop(j:l),k*10,mpi_character,0,2,mpi_comm_world,status,mpierr)
+          call mpi_recv(inp.feat_comp(j:l),k,mpi_integer8,0,3,mpi_comm_world,status,mpierr)
+          call mpi_recv(inp.feat_unit(:,j:l),k*nunit,mpi_double_precision,0,4,mpi_comm_world,status,mpierr)
+        end if
+      end if
+
+      ! feature combination
+      i=ntot-nthis+1  ! start1
+      j=ntot-nthis+mpin(mpirank+1)  ! end1
+      k=ntot-nthis+(nthis-mpin2(mpirank+1)+1) ! end2
+      total_comb=mpin(mpirank+1)*(ntot-nthis)+(mpin(mpirank+1)-1)*mpin(mpirank+1)/2.0+&
+                 mpin(mpirank+1)*(nthis-mpin2(mpirank+1)+1-mpin(mpirank+1))+mpin(mpirank+1) ! binary+unary
+
+      if(mpin(mpirank+1)>0) &
+      call combine_se(inp,i,j,1,k,trim(adjustl(ops(icomb))),nf(icomb),total_comb)
+      call mpi_barrier(mpi_comm_world,mpierr)
+
+  IF (icomb < rung) THEN
+
+       !---------------------------
+       ! redundant check
+       !---------------------------
+       ! create nfpcore_this: number of new features generated in each CPU core
+       if(mpirank/=0) then
+          call mpi_send(nf(icomb),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+       else
+          nfpcore_this(1)=nf(icomb)
+          do mpii=1,mpisize-1
+               call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+          end do
+       end if
+       call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+
+       ! creating a unique scalar number fID for each feature
+       allocate(fID(nf(icomb))) 
+       do i=1,nf(icomb)
+          fID(i)=sqrt(sum((tag+evaluator_se(gen.Sexpr(i)))**2))
+       end do
+
+       ! create the "available" to store the repetition information.
+        allocate(available(nf(icomb)))
+        available=.true.
+       ! create the "order" to store the ordering information for bisection method.
+        allocate(order(nf(icomb)+1))
+ 
+       !----------------------------------------------------------------------------------------
+       ! redundant check inside each core: creating the "order" and "available".
+       if(mpirank==0) then
+            write(*,'(a,i15)') 'Number of newly generated features: ',sum(nfpcore_this) 
+            write(*,'(a)') 'Redundant check on the newly generated features ...'
+       end if
+        
+       ! inside each core
+       call dup_scheck_se(nfpcore_this(mpirank+1),fID,gen.feat_name,gen.feat_comp,gen.Sexpr,order,available)
+       ! between cores
+       if(mpisize>1) call dup_pcheck_se(nfpcore_this,fID,gen.feat_name,gen.feat_comp,gen.Sexpr,order,available)
+       !-----------------------------------------------------------------------------------------
+
+
+       !---------------------------
+       ! remove duplicated features
+       !---------------------------
+       j=0
+       do i=1,nf(icomb)
+          if(available(i)) then
+             j=j+1
+             gen.Sexpr(j)=gen.Sexpr(i)
+             gen.feat_name(j)=gen.feat_name(i)
+             gen.lastop(j)=gen.lastop(i)
+             gen.feat_comp(j)=gen.feat_comp(i)
+             gen.feat_unit(:,j)=gen.feat_unit(:,i)
+             fID(j)=fID(i)
+          end if
+       end do
+       nf(icomb)=j
+
+       !renew nfpcore_this
+       if(mpirank/=0) then
+          call mpi_send(nf(icomb),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+       else
+          nfpcore_this(1)=nf(icomb)
+          do mpii=1,mpisize-1
+               call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+          end do
+       end if
+       call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+
+       nthis=sum(nfpcore_this)
+       ntot=ntot+nthis
+       if(mpirank==0) then
+           write(*,'(a,i15)') 'Number of newly generated features after redundant check: ',nthis
+       end if
+      !--------------------------------------------------------------------
+      ! cp results of gen.feat in all cores to the inp.feat in core0
+      !--------------------------------------------------------------------
+       i=nf(icomb)
+       if(mpirank>0 .and. i>0) then
+          do ll=1,i
+           call mpi_send(gen.Sexpr(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,0,5,mpi_comm_world,status,mpierr)
+           call mpi_send(gen.Sexpr(ll).list_len,1,mpi_integer,0,6,mpi_comm_world,status,mpierr)
+           call mpi_send(gen.Sexpr(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,0,7,mpi_comm_world,status,mpierr)
+           call mpi_send(gen.Sexpr(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,0,8,mpi_comm_world,status,mpierr)
+           call mpi_send(gen.Sexpr(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,0,9,mpi_comm_world,status,mpierr)
+           call mpi_recv(mpisync,1,mpi_character,0,10,mpi_comm_world,status,mpierr)
+          end do
+          call mpi_send(gen.feat_name(:i),i*str_len,mpi_character,0,1,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.lastop(:i),i*10,mpi_character,0,2,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.feat_comp(:i),i,mpi_integer8,0,3,mpi_comm_world,status,mpierr)
+          call mpi_send(gen.feat_unit(:,:i),nunit*i,mpi_double_precision,0,4,mpi_comm_world,status,mpierr)
+       else if(mpirank==0) then
+          if(ntot>ubound(inp.Sexpr,1)) call addm_inp_se(ntot-ubound(inp.Sexpr,1),inp)
+          ! from core0 to core0
+          inp.Sexpr(ntot-nthis+1:ntot-nthis+i)=gen.Sexpr(:i)
+          inp.lastop(ntot-nthis+1:ntot-nthis+i)=gen.lastop(:i)
+          inp.feat_comp(ntot-nthis+1:ntot-nthis+i)=gen.feat_comp(:i)
+          inp.feat_name(ntot-nthis+1:ntot-nthis+i)=gen.feat_name(:i)
+          inp.feat_unit(:,ntot-nthis+1:ntot-nthis+i)=gen.feat_unit(:,:i)
+          ! from all other cores to core0
+          do mpii=1,mpisize-1
+              j=ntot-nthis+sum(nfpcore_this(:mpii))+1    ! start
+              k=ntot-nthis+sum(nfpcore_this(:mpii+1))    ! end
+              l=nfpcore_this(mpii+1)     ! size
+              if(l>0) then
+              do ll=j,k
+         call mpi_recv(inp.Sexpr(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,mpii,5,mpi_comm_world,status,mpierr)
+         call mpi_recv(inp.Sexpr(ll).list_len,1,mpi_integer,mpii,6,mpi_comm_world,status,mpierr)
+         call mpi_recv(inp.Sexpr(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,mpii,7,mpi_comm_world,status,mpierr)
+         call mpi_recv(inp.Sexpr(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,mpii,8,mpi_comm_world,status,mpierr)
+         call mpi_recv(inp.Sexpr(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,mpii,9,mpi_comm_world,status,mpierr)
+         call mpi_send(mpisync,1,mpi_character,mpii,10,mpi_comm_world,status,mpierr)
+              end do
+              call mpi_recv(inp.feat_name(j:k),l*str_len,mpi_character,mpii,1,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.lastop(j:k),l*10,mpi_character,mpii,2,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.feat_comp(j:k),l,mpi_integer8,mpii,3,mpi_comm_world,status,mpierr)
+              call mpi_recv(inp.feat_unit(:,j:k),nunit*l,mpi_double_precision,mpii,4,mpi_comm_world,status,mpierr)
+              end if
+         end do
+        end if
+
+       !----------------------------------------------------------------
+       ! collect the newly selected features from each core to mpirank0
+       !----------------------------------------------------------------
+       if(mpirank/=0) then
+         call mpi_send(sel.nselect,1,mpi_integer8,0,5,mpi_comm_world,status,mpierr)
+       else
+         mpik=sel.nselect
+         do mpii=1,mpisize-1
+           call mpi_recv(mpij,1,mpi_integer8,mpii,5,mpi_comm_world,status,mpierr)
+           mpik=mpik+mpij  ! count the total number of selected features
+         end do
+       end if
+       !--------------------------------------
+       ! print the space information
+       !--------------------------------------
+        if(mpirank==0) then
+          write(phiname,'(a,i2.2)') 'Phi',icomb
+          call writeout_se(phiname,mpik,ntot)
+        end if
+       ! delete useless space
+       deallocate(order)
+       deallocate(available)
+       deallocate(fID)
+  END IF
+
+end do
+! -------- end of feature combination ------
+
+! release the spaces
+deallocate(inp.Sexpr)
+deallocate(inp.feat_name)
+deallocate(inp.lastop)
+deallocate(inp.feat_comp)
+deallocate(inp.feat_unit)
+
+!---------------------------------------------------------------
+!  collect the information of selected features from all cores
+!---------------------------------------------------------------
+
+if(rung==0) then
+ nfpcore_this=sel.nselect
+else
+ if(mpirank/=0) then
+     call mpi_send(nf(rung),1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+     call mpi_send(sel.nselect,1,mpi_integer8,0,2,mpi_comm_world,status,mpierr)
+ else
+      nfpcore_this(1)=sel.nselect
+      do mpii=1,mpisize-1
+           call mpi_recv(mpij,1,mpi_integer8,mpii,1,mpi_comm_world,status,mpierr)
+           nf(rung)=nf(rung)+mpij
+           call mpi_recv(nfpcore_this(mpii+1),1,mpi_integer8,mpii,2,mpi_comm_world,status,mpierr)
+      end do
+     write(phiname,'(a,i2.2)') 'Phi',rung
+     write(*,'(a,i15)') 'Total number of newly generated features: ',nf(rung)
+     call writeout_se(phiname,sum(nfpcore_this),ntot+nf(rung))
+ end if
+end if
+
+call mpi_bcast(nfpcore_this,mpisize,mpi_integer8,0,mpi_comm_world,mpierr)
+call mpi_barrier(mpi_comm_world,mpierr)
+!-----------------------------------------------------------------------------
+
+! redundant check for selected features
+!---------------------------------------
+allocate(available(nfpcore_this(mpirank+1)))
+available=.true.
+allocate(order(nfpcore_this(mpirank+1)+1))
+
+if(mpirank==0) write(*,'(/a)') 'Redundant check on selected features ...'
+
+! serial redundant check
+call dup_scheck_se(nfpcore_this(mpirank+1),sel.feat_score(1,:),sel.feat_name,sel.feat_comp,sel.Sexpr,order,available)
+
+! parallel redundant check
+if(mpisize>1) call dup_pcheck_se(nfpcore_this,sel.feat_score(1,:),sel.feat_name,sel.feat_comp,sel.Sexpr,order,available)
+
+!---------------------------------------
+! sure independence screening
+!---------------------------------------
+if(mpirank==0) then
+    allocate(sis.Sexpr(nf_sis(iFCDI)))
+    allocate(sis.feat_name(nf_sis(iFCDI)))
+    allocate(sis.feat_score(2,nf_sis(iFCDI)))
+end if
+
+! selecting the best features from sel.XXX to sis.XXX
+call sure_indep_screening_se(nfpcore_this,available)
+
+! output the selected feature space
+!--------------------------------------
+if(mpirank==0) then
+   ! output the space.expressions
+   if(iFCDI==1) then
+     open(fileunit,file='SIS_subspaces/Uspace.expressions',status='replace')
+   else
+     open(fileunit,file='SIS_subspaces/Uspace.expressions',position='append',status='old')
+   end if
+   if(ptype==1) then
+      do i=1,nf_sis_avai(iFCDI)
+        write(fileunit,'(2a,f12.4)') trim(sis.feat_name(i)),'     SIS_score =',sis.feat_score(1,i)
+      end do
+   else if(ptype==2) then
+      do i=1,nf_sis_avai(iFCDI)
+        sis.feat_score(1,i)=1.d0/sis.feat_score(1,i)-1.d0   ! score 1: overlap_n, score 2: normalized overlap_length
+        if(abs(sis.feat_score(2,i))>1d9) then
+           write(fileunit,'(2a,i6,a,f12.4)') trim(sis.feat_name(i)),'    N_overlap =',nint(sis.feat_score(1,i))        
+        else
+            if( nint(sis.feat_score(1,i))/=0 ) then  ! overlapped
+               sis.feat_score(2,i)=1.d0/sis.feat_score(2,i)-1.d0  ! overlapped length
+            else   
+               sis.feat_score(2,i)=-sis.feat_score(2,i)  ! separation distance
+            end if
+           write(fileunit,'(2a,i6,a,f12.4)') trim(sis.feat_name(i)),'    N_overlap =',nint(sis.feat_score(1,i)),&
+                                                                   '    S_overlap =',sis.feat_score(2,i)
+        end if
+      end do
+   end if
+   close(fileunit)
+
+     allocate(tmp_data(npoint,nf_sis_avai(iFCDI)))
+     do j=1,nf_sis_avai(iFCDI)
+        tmp_data(:,j)=evaluator_se(sis.Sexpr(j))
+     end do
+   ! output the data
+2001  format(*(e20.10))
+     do j=1,ntask
+        mm1=sum(nsample(:j-1))+1
+        mm2=sum(nsample(:j))
+        if(ntask>1) then
+            write(line,'(a,i3.3,a)') 'Uspace_t',j,'.dat'
+        else
+            write(line,'(a)') 'Uspace.dat'
+        end if
+        
+        if(iFCDI>1) then
+            call rename('SIS_subspaces/'//trim(adjustl(line)),'SIS_subspaces/'//trim(adjustl(line))//'_tmp')
+            open(fileunit+1,file='SIS_subspaces/'//trim(adjustl(line))//'_tmp',status='old')
+        end if
+        open(fileunit,file='SIS_subspaces/'//trim(adjustl(line)),status='replace')
+
+        do ll=mm1,mm2
+           superline=''
+           if(iFCDI==1) then
+             if(ptype==1) then
+                  write(fileunit,2001) trainy(ll),tmp_data(ll,:nf_sis_avai(iFCDI))
+             elseif(ptype==2) then
+                  write(fileunit,2001) tmp_data(ll,:nf_sis_avai(iFCDI))
+             endif
+           elseif(iFCDI>1) then
+             read(fileunit+1,'(a)') superline
+             if(ptype==1) then
+               write(superline(20*(sum(nf_sis_avai(:iFCDI-1))+1)+1:),2001) tmp_data(ll,:nf_sis_avai(iFCDI)) 
+             elseif(ptype==2) then
+               write(superline(20*(sum(nf_sis_avai(:iFCDI-1)))+1:),2001) tmp_data(ll,:nf_sis_avai(iFCDI))
+             endif
+             write(fileunit,'(a)') trim(superline)
+          end if
+        end do
+        close(fileunit)
+        if(iFCDI>1) close(fileunit+1,status='delete')
+     end do
+     deallocate(tmp_data)
+
+   write(*,'(3a,i10)') 'Size of the SIS-selected subspace from ',phiname,': ',nf_sis_avai(iFCDI)
+   write(9,'(3a,i10)') 'Size of the SIS-selected subspace from ',phiname,': ',nf_sis_avai(iFCDI)
+end if  
+
+! release all the spaces
+deallocate(trainy)
+deallocate(trainy_c)
+deallocate(sel.Sexpr)
+deallocate(sel.feat_comp)
+deallocate(sel.feat_score)
+deallocate(sel.feat_name)
+deallocate(available)
+deallocate(order)
+
+if(mpirank==0) then
+  deallocate(sis.Sexpr)
+  deallocate(sis.feat_name)
+  deallocate(sis.feat_score)
+end if
+if(nreject>0) deallocate(reject)
+
+call mpi_barrier(mpi_comm_world,mpierr)
+if(mpirank==0) then
+   mytime.eFC=mpi_wtime()
+   write(9,'(a,f15.2)') 'Time (second) used for this FC: ',mytime.eFC-mytime.sFC
+end if
+
+end subroutine
+
+
+subroutine combine_se(myinp,s1,e1,s2,e2,myops,nf,total_comb) 
+implicit none
+type(feature) myinp
+type(Sexpression) Sexprtmp
+real progress
+real*8 unit_tmp(ubound(myinp.feat_unit,1))
+integer*8 i,j,k,kk,kkk,l,nf,comp_tmp,counter,total_comb,s1,e1,s2,e2
+character(len=*) myops
+character(len=str_len) name_tmp,lastop_tmp*10
+logical skip
+
+counter=0
+progress=0.2
+
+do i=s1,e1
+  
+! no operation
+      IF(trim(adjustl(myops))=='NO') THEN
+          lastop_tmp=''
+          comp_tmp=myinp.feat_comp(i)
+          name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')'
+          unit_tmp=myinp.feat_unit(:,i)
+          Sexprtmp=myinp.Sexpr(i)
+          call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          cycle
+      END IF
+
+! unary operators
+      counter=counter+1
+      comp_tmp=myinp.feat_comp(i)+1
+      if(comp_tmp>fcomplexity) goto 599
+
+      ! exp
+      IF(index(myops,'(exp)')/=0 ) then
+      if(  index(myinp.lastop(i),'(exp')==0 .and. index(myinp.lastop(i),'(log)')==0 ) then ! avoid exp(exp( and exp(log(
+        lastop_tmp='(exp)'
+        name_tmp='exp('//trim(adjustl(myinp.feat_name(i)))//')'
+        unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(exp)')
+        call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(exp)')
+        call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+      end if
+      END IF
+
+      ! exp-
+      IF(index(myops,'(exp-)')/=0 ) then
+      if( index(myinp.lastop(i),'(exp')==0  .and. index(myinp.lastop(i),'(log)')==0 ) then ! avoid exp(exp( and exp(log(
+        lastop_tmp='(exp-)'
+        name_tmp='exp(-'//trim(adjustl(myinp.feat_name(i)))//')'
+        unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(exp-)')
+        call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(exp-)')
+        call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+      end if
+      END IF
+
+      ! ^-1
+      IF(index(myops,'(^-1)')/=0) then
+        if(minval(abs(evaluator_se(myinp.Sexpr(i))))>1d-50 ) then  ! avoid divided by zero
+           lastop_tmp='(^-1)'
+           name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^-1'
+           unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^-1)')
+           call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(^-1)')
+           call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+        end if
+       END IF
+
+      ! scd: Standard Cauchy Distribution
+      IF(index(myops,'(scd)')/=0) then
+           lastop_tmp='(scd)'
+           name_tmp='scd('//trim(adjustl(myinp.feat_name(i)))//')'
+           unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(scd)')
+           call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(scd)')
+           call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       END IF
+
+      ! ^2
+      IF(index(myops,'(^2)')/=0) then
+        if(index(myinp.lastop(i),'(sqrt)')==0 ) then ! avoid (sqrt())^2
+           lastop_tmp='(^2)'
+           name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^2'
+           unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^2)')
+           call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(^2)')
+           call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+        end if
+       END IF
+
+      ! ^3
+      IF(index(myops,'(^3)')/=0) then
+        if(index(myinp.lastop(i),'(cbrt)')==0 ) then ! avoid (cbrt())^3
+         lastop_tmp='(^3)'
+         name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^3'
+         unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^3)')
+         call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(^3)')
+         call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       end if
+     END IF
+
+      ! ^6
+      IF(index(myops,'(^6)')/=0) then
+         lastop_tmp='(^6)'
+         name_tmp='('//trim(adjustl(myinp.feat_name(i)))//')^6'
+         unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(^6)')
+         call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(^6)')
+         call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+       END IF
+
+      ! sqrt
+      IF(index(myops,'(sqrt)')/=0) then
+        if(index(myinp.lastop(i),'(^2)')==0 ) then  ! avoid sqrt((^2))
+          if( minval(evaluator_se(myinp.Sexpr(i)))>0  ) then
+              lastop_tmp='(sqrt)'     
+              name_tmp='sqrt('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(sqrt)')
+              call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(sqrt)')
+              call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          end if
+         end if
+       END IF
+
+      ! cbrt: cube root
+      IF(index(myops,'(cbrt)')/=0) then
+        if(index(myinp.lastop(i),'(^3)')==0 ) then  ! avoid cbrt((^3))
+              lastop_tmp='(cbrt)'
+              name_tmp='cbrt('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(cbrt)')
+              call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(cbrt)')
+              call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+         end if
+       END IF
+
+      ! log
+      IF(index(myops,'(log)')/=0) then
+        if( index(myinp.lastop(i),'(exp')==0 .and. index(myinp.lastop(i),'(log)')==0  ) then ! avoid log(exp( and log(log(
+          if( minval(evaluator_se(myinp.Sexpr(i)))>0 ) then
+              lastop_tmp='(log)'
+              name_tmp='log('//trim(adjustl(myinp.feat_name(i)))//')'
+              unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(log)')
+              call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(log)')
+              call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+          end if
+         end if
+       END IF
+
+      ! sin
+      IF(index(myops,'(sin)')/=0) then
+         lastop_tmp='(sin)'
+         name_tmp='sin('//trim(adjustl(myinp.feat_name(i)))//')'
+         unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(sin)')
+         call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(sin)')
+         call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+     END IF
+
+      ! cos
+      IF(index(myops,'(cos)')/=0) then
+         lastop_tmp='(cos)'
+         name_tmp='cos('//trim(adjustl(myinp.feat_name(i)))//')'
+         unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,i),'(cos)')
+         call Sexpr_ucomb_se(myinp.Sexpr(i),Sexprtmp,'(cos)')
+         call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+     END IF
+
+599  continue
+
+  ! binary operators
+  do j=s2,e2
+
+      if( j>=s1 .and. j<=i ) cycle
+
+      counter=counter+1
+      comp_tmp=myinp.feat_comp(i)+myinp.feat_comp(j)+1
+      if(comp_tmp>fcomplexity) goto 602
+
+      ! sum and subtract 
+       IF(index(myops,'(+)')/=0 .or. index(myops,'(-)')/=0  .or. index(myops,'(|-|)')/=0   ) THEN
+
+               ! different units
+               if ( maxval(abs(myinp.feat_unit(:,i)-myinp.feat_unit(:,j)))>1d-8 ) goto 600
+
+               !---
+               IF(index(myops,'(+)')/=0) then
+                   lastop_tmp='(+)'
+                   name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'+'//trim(adjustl(myinp.feat_name(j)))//')'
+                   unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(+)')  
+                   call Sexpr_bcomb_se(myinp.Sexpr(i),myinp.Sexpr(j),Sexprtmp,'(+)')
+                   call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                END IF
+                !---
+
+                IF(index(myops,'(-)')/=0) then  ! A-B
+                   lastop_tmp='(-)'
+                   name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'-'//trim(adjustl(myinp.feat_name(j)))//')'
+                   if(index(myops,'(+)')==0) unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(-)')   
+                   call Sexpr_bcomb_se(myinp.Sexpr(i),myinp.Sexpr(j),Sexprtmp,'(-)')
+                   call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+
+                  if(icomb<rung) then  ! B-A (no need if icomb=rung because of the coeff.)
+                    ! lastop_tmp='(-)'   ! same with above
+                     name_tmp='('//trim(adjustl(myinp.feat_name(j)))//'-'//trim(adjustl(myinp.feat_name(i)))//')'
+                    ! unit_tmp=dimcomb_se(inp.feat_unit(:,i),dim_in2(:,j),'(-)')   ! same with above
+                     call Sexpr_bcomb_se(myinp.Sexpr(j),myinp.Sexpr(i),Sexprtmp,'(-)')
+                     call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                  end if
+                 END IF
+
+                !-----
+                IF(index(myops,'(|-|)')/=0) then  ! absolute difference
+                  lastop_tmp='(|-|)'
+                  name_tmp='abs('//trim(adjustl(myinp.feat_name(i)))//'-'//trim(adjustl(myinp.feat_name(j)))//')'
+                  if(index(myops,'(+)')==0) unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(|-|)')  
+                  call Sexpr_bcomb_se(myinp.Sexpr(i),myinp.Sexpr(j),Sexprtmp,'(|-|)')
+                  call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                 END IF
+                 !----
+         END IF
+
+600   continue
+
+      ! multiplication and division
+
+           !multiplication
+            IF(index(myops,'(*)')/=0) then
+                lastop_tmp='(*)'
+                name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'*'//trim(adjustl(myinp.feat_name(j)))//')'
+                unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(*)')
+                call Sexpr_bcomb_se(myinp.Sexpr(i),myinp.Sexpr(j),Sexprtmp,'(*)')
+                call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+            END IF
+
+          ! division
+           IF(index(myops,'(/)')/=0) then
+
+              ! avoid A/(B/C)=(A*C)/B and (A/B)/C=A/(B*C) which already exist via previous (*)
+              if(index(myops,'(*)')/=0 .and. &
+                (index(myinp.lastop(i),'(/)')/=0 .or. index(myinp.lastop(j),'(/)')/=0) .and. &
+                (icomb < rung)  )  goto 602   ! do nothing
+
+              skip=.false.
+              ! avoid A*B/B and A*B/A
+              if(index(myinp.lastop(i),'(*)')/=0) then
+                   k=index(myinp.feat_name(i),trim(adjustl(myinp.feat_name(j))))
+                 if(k>0) then
+                   kk=len_trim(myinp.feat_name(i)(k:))
+                   kkk=len_trim(adjustl(myinp.feat_name(j)))
+                  if( (trim(adjustl(myinp.feat_name(i)(:k-1)))=='(' &   ! j being the left part of i
+                      .and. myinp.feat_name(i)(k+kkk:k+kkk)=='*') .or. &
+                      (kk==kkk+1 .and. myinp.feat_name(i)(k-1:k-1)=='*')) then ! j being the right part of i
+                      if(index(myops,'(^-1)')/=0) then  ! i/j is no need. what about j/i?
+                          goto 602   ! no j/i
+                      else
+                          goto 601   ! do j/i
+                      end if
+                   end if
+                 end if
+              end if
+                 
+              ! avoid A/(A*B) and B/(A*B)
+             if(index(myinp.lastop(j),'(*)')/=0) then
+                 k=index(myinp.feat_name(j),trim(adjustl(myinp.feat_name(i))))
+                if(k>0) then
+                  kk=len_trim(myinp.feat_name(j)(k:))
+                  kkk=len_trim(adjustl(myinp.feat_name(i)))
+                  if((trim(adjustl(myinp.feat_name(j)(:k-1)))=='(' &
+                      .and. myinp.feat_name(j)(k+kkk:k+kkk)=='*') .or. &   ! i being the left part of j
+                     (kk==kkk+1 .and. myinp.feat_name(j)(k-1:k-1)=='*')) then  ! i beingn the right part of j
+                      if(index(myops,'(^-1)')/=0) then  ! j/i is no need. what about i/j?
+                          goto 602  ! no i/j
+                      else
+                          skip=.true.  ! do i/j
+                      end if
+                  end if
+                end if
+             end if
+
+              !i/j
+              !---------
+              lastop_tmp='(/)'
+              name_tmp='('//trim(adjustl(myinp.feat_name(i)))//'/'//trim(adjustl(myinp.feat_name(j)))//')'
+              unit_tmp=dimcomb_se(myinp.feat_unit(:,i),myinp.feat_unit(:,j),'(/)')
+              if(minval(abs(evaluator_se(myinp.Sexpr(j))))>1d-50 ) then 
+              call Sexpr_bcomb_se(myinp.Sexpr(i),myinp.Sexpr(j),Sexprtmp,'(/)')
+              call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+              end if
+              !------
+
+     601      continue
+              if(skip) goto 602
+
+            !j/i
+            !---------
+                 lastop_tmp='(/)'
+                 name_tmp='('//trim(adjustl(myinp.feat_name(j)))//'/'//trim(adjustl(myinp.feat_name(i)))//')'
+                 unit_tmp=dimcomb_se(myinp.feat_unit(:,j),myinp.feat_unit(:,i),'(/)')
+                 if(minval(abs(evaluator_se(myinp.Sexpr(i))))>1d-50) then
+                 call Sexpr_bcomb_se(myinp.Sexpr(j),myinp.Sexpr(i),Sexprtmp,'(/)')
+                 call isgoodf_se(Sexprtmp,name_tmp,lastop_tmp,comp_tmp,unit_tmp,nf)
+                 end if
+           END if
+
+   602   continue
+  end do
+
+  if(float(counter)/float(total_comb)>=progress) then
+     do while( (float(counter)/float(total_comb)) >= (progress +0.2) )
+           progress = progress+0.2
+     end do
+     write(*,'(a,i4,2(a,i15),a,f6.1,a)') &
+     'mpirank = ',mpirank,'  #generated =',nf,'  #selected =',sel.nselect,'  progress =',progress*100,'%'
+     progress=progress+0.2
+  end if
+
+end do
+
+end subroutine
+
+
+function goodf_se(mySexpr,feat_name,feat_unit,feat_comp)
+type(Sexpression) mySexpr
+integer*8 i,j,k,l,ll,mm1,mm2,feat_comp
+real*8 feat_data(npoint),feat_unit(:),scoretmp(2),maxabs
+character(len=*) feat_name
+logical goodf_se,lsame
+
+goodf_se=.true.
+   feat_data=evaluator_se(mySexpr)
+
+   mm1=1
+   mm2=sum(nsample(:ntask))
+  if(maxval(abs(feat_data(mm1:mm2)-feat_data(mm1)))<=1d-8) then
+    goodf_se=.false. ! constant feature.
+    return
+  end if
+
+  maxabs=maxval(abs(feat_data(mm1:mm2)))
+  if(maxabs>1d50 .or. maxabs<=1d-50) then
+    goodf_se=.false. ! intinity or zero
+    return
+  end if
+
+! not selected but can be used for further combination
+if(maxabs>fmax_max .or. maxabs<fmax_min) return
+
+! feature score 
+if(ptype==1) scoretmp=sis_score_se(mySexpr,trainy_c)
+if(ptype==2) scoretmp=sis_score_se(mySexpr,trainy) !classification
+
+if (scoretmp(1)<score_threshold) return
+
+! reject the previously selected features
+if(nreject>0) then
+  feat_name=adjustl(feat_name)
+  lsame=.false.
+  i=0; j=nreject; k=i+ceiling((j-i)/2.0)
+  do while(i/=j)
+     if(trim(feat_name)==trim(reject(k))) then
+       lsame=.true.
+       i=j
+     else if(feat_name<reject(k)) then
+       i=k;
+     else if(feat_name>reject(k)) then
+       j=k;
+     end if
+
+     if(k==i+ceiling((j-i)/2.0)) then
+         i=j
+     else
+         k=i+ceiling((j-i)/2.0)
+     end if
+  end do
+  if(lsame) return
+end if
+
+!--------------------------
+! selected
+!--------------------------
+sel.nselect=sel.nselect+1
+sel.Sexpr(sel.nselect)=mySexpr
+sel.feat_comp(sel.nselect)=feat_comp
+sel.feat_name(sel.nselect)=feat_name
+sel.feat_score(:,sel.nselect)=scoretmp
+if( sel.nselect== nbasic_select+nextra_select ) call update_select_se
+
+end function
+
+
+subroutine addm_gen_se(n,gen)
+! increase array size
+real*8,allocatable:: real2d(:,:)
+integer*8,allocatable:: integer1d(:)
+character(len=str_len),allocatable:: char1d(:),char1d2(:)*10
+integer*8 i,j,k,n
+type(feature) gen
+type(Sexpression),allocatable:: Sexprtmp(:)
+i=npoint
+j=ubound(gen.Sexpr,1)
+
+! gen.Sexpr
+allocate(Sexprtmp(j))
+Sexprtmp=gen.Sexpr
+deallocate(gen.Sexpr)
+allocate(gen.Sexpr(j+n))
+gen.Sexpr(:j)=Sexprtmp
+deallocate(Sexprtmp)
+!-----
+!gen.feat_name
+allocate(char1d(j))
+char1d=gen.feat_name
+deallocate(gen.feat_name)
+allocate(gen.feat_name(j+n))
+gen.feat_name(:j)=char1d
+deallocate(char1d)
+!----
+!gen.lastop
+allocate(char1d2(j))
+char1d2=gen.lastop
+deallocate(gen.lastop)
+allocate(gen.lastop(j+n))
+gen.lastop(:j)=char1d2
+deallocate(char1d2)
+!----
+!gen.feat_comp
+allocate(integer1d(j))
+integer1d=gen.feat_comp
+deallocate(gen.feat_comp)
+allocate(gen.feat_comp(j+n))
+gen.feat_comp(:j)=integer1d
+deallocate(integer1d)
+!gen.feat_unit
+i=ubound(gen.feat_unit,1)
+allocate(real2d(i,j))
+real2d=gen.feat_unit
+deallocate(gen.feat_unit)
+allocate(gen.feat_unit(i,j+n))
+gen.feat_unit(:,:j)=real2d
+deallocate(real2d)
+!--
+end subroutine
+
+
+subroutine addm_inp_se(n,inp)
+! increase array size
+real*8,allocatable:: real2d(:,:)
+type(feature) inp
+type(Sexpression),allocatable:: Sexprtmp(:)
+integer*8,allocatable:: integer1d(:)
+character(len=str_len),allocatable:: char1d(:),char1d2(:)*10
+integer*8 i,j,k,n
+i=npoint
+j=ubound(inp.Sexpr,1)
+
+! inp.Sexpr
+allocate(Sexprtmp(j))
+Sexprtmp=inp.Sexpr
+deallocate(inp.Sexpr)
+allocate(inp.Sexpr(j+n))
+inp.Sexpr(:j)=Sexprtmp
+deallocate(Sexprtmp)
+!-----
+!inp.feat_name
+allocate(char1d(j))
+char1d=inp.feat_name
+deallocate(inp.feat_name)
+allocate(inp.feat_name(j+n))
+inp.feat_name(:j)=char1d
+deallocate(char1d)
+!----
+!inp.lastop
+allocate(char1d2(j))
+char1d2=inp.lastop
+deallocate(inp.lastop)
+allocate(inp.lastop(j+n))
+inp.lastop(:j)=char1d2
+deallocate(char1d2)
+!---
+!inp.feat_comp
+allocate(integer1d(j))
+integer1d=inp.feat_comp
+deallocate(inp.feat_comp)
+allocate(inp.feat_comp(j+n))
+inp.feat_comp(:j)=integer1d
+deallocate(integer1d)
+!inp.feat_unit
+i=ubound(inp.feat_unit,1)
+allocate(real2d(i,j))
+real2d=inp.feat_unit
+deallocate(inp.feat_unit)
+allocate(inp.feat_unit(i,j+n))
+inp.feat_unit(:,:j)=real2d
+deallocate(real2d)
+!--
+end subroutine
+
+
+function dimcomb_se(dim1,dim2,op)
+! calculate the units for new features
+! unary operator: set dim1 and dim2 the same
+real*8 dim1(:),dim2(:),dimcomb_se(ubound(dim1,1))
+character(len=*) op
+integer i,j,k
+
+if(trim(adjustl(op))=='(+)' .or. trim(adjustl(op))=='(-)' .or. trim(adjustl(op))=='(|-|)'  ) then
+   do i=1,ubound(dim1,1)
+      if(abs(dim1(i)-dim2(i))>1d-8) stop 'Error: different units in linear combination'
+   end do
+   dimcomb_se=dim1
+else if(trim(adjustl(op))=='(*)') then
+   dimcomb_se=dim1+dim2
+else if(trim(adjustl(op))=='(/)') then
+   dimcomb_se=dim1-dim2
+else if(trim(adjustl(op))=='(exp)') then   
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(exp-)') then
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(log)') then
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(scd)') then 
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(sin)') then
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(cos)') then
+   dimcomb_se=0.d0
+else if(trim(adjustl(op))=='(^-1)') then
+   dimcomb_se=dim1*(-1)
+else if(trim(adjustl(op))=='(^2)') then
+   dimcomb_se=dim1*2
+else if(trim(adjustl(op))=='(^3)') then
+   dimcomb_se=dim1*3
+else if(trim(adjustl(op))=='(^6)') then
+   dimcomb_se=dim1*6
+else if(trim(adjustl(op))=='(sqrt)') then
+   dimcomb_se=dim1/2.d0
+else if(trim(adjustl(op))=='(cbrt)') then
+   dimcomb_se=dim1/3.d0
+end if
+end function
+
+
+subroutine writeout_se(phiname,i,k)
+integer*8 i,j,k
+character(len=*) phiname
+write(*,'(3a,i15)') 'Total number of features in the space ',trim(phiname),':',k
+write(9,'(3a,i15)') 'Total number of features in the space ',trim(phiname),':',k
+end subroutine
+
+
+function simpler_se(complexity1,complexity2,name1,name2)
+integer*8 simpler_se,complexity1,complexity2
+character(len=*) name1,name2
+
+if(complexity1<complexity2) then
+   simpler_se=1
+else if(complexity1>complexity2) then
+   simpler_se=2
+else if(complexity1==complexity2) then
+   if(len_trim(name1)<len_trim(name2) .or. &
+     (len_trim(name1)==len_trim(name2) .and. trim(name1)<=trim(name2) ) ) then
+      simpler_se=1
+   else
+      simpler_se=2
+   end if
+end if
+end function
+
+
+subroutine update_availability_se(available,pavailable)
+logical available(:),pavailable(:)
+if( any(available) ) then
+   pavailable(mpirank+1)=.true.
+else
+   pavailable(mpirank+1)=.false.
+end if
+end subroutine
+
+
+subroutine dup_pcheck_se(nfpcore_this,fID,fname,complexity,mySexpr,order,available)
+! output array 'available'
+real*8 fID(:)
+integer*8 i,j,k,l,ll,mpii,mpij,nfpcore_this(:),mpim(mpisize),mpiloc(mpisize),order(:),loc(1),&
+          complexity(:),simpler_result
+character(len=*) fname(:)
+logical available(:)
+real*8,allocatable:: compidentity(:)
+character(len=str_len),allocatable:: compname(:)
+integer*8,allocatable:: compcomplexity(:)
+type(Sexpression),allocatable:: Sexpr4comp(:)
+type(Sexpression) mySexpr(:)
+
+IF(mpisize>1) THEN
+   mpim=nfpcore_this  ! number of features in each core
+   do i=1,mpisize 
+      loc(1:1)=maxloc(mpim)
+      mpiloc(i)=loc(1)  ! sort the cores by the number of features from high to low
+      mpim(loc(1))=-1
+   end do
+
+   do k=1,mpisize-1
+
+     if(nfpcore_this(mpiloc(k))>0) then
+         allocate(compidentity(nfpcore_this(mpiloc(k))))
+         allocate(compname(nfpcore_this(mpiloc(k))))
+         allocate(compcomplexity(nfpcore_this(mpiloc(k))))
+         allocate(Sexpr4comp(nfpcore_this(mpiloc(k))))
+
+         if(mpirank==mpiloc(k)-1) then   ! every core knows the same mpiloc
+           compidentity=fID(:nfpcore_this(mpiloc(k)))
+           compname=fname(:nfpcore_this(mpiloc(k)))
+           compcomplexity=complexity(:nfpcore_this(mpiloc(k)))
+           Sexpr4comp=mySexpr(:nfpcore_this(mpiloc(k)))
+         end if
+
+         ! broadcast compidentity from core mpiloc(k)-1 to all other cores
+          call mpi_bcast(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the feature expressions
+          call mpi_bcast(compname,nfpcore_this(mpiloc(k))*str_len,mpi_character,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the feature complexities
+          call mpi_bcast(compcomplexity,nfpcore_this(mpiloc(k)),mpi_integer8,mpiloc(k)-1,mpi_comm_world,mpierr)
+         ! broadcast the features
+          do ll=1,nfpcore_this(mpiloc(k))
+           call mpi_bcast(Sexpr4comp(ll).list_id(:Smaxlen),Smaxlen,mpi_integer,mpiloc(k)-1,mpi_comm_world,mpierr)
+           call mpi_bcast(Sexpr4comp(ll).list_len,1,mpi_integer,mpiloc(k)-1,mpi_comm_world,mpierr)
+           call mpi_bcast(Sexpr4comp(ll).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,mpiloc(k)-1,mpi_comm_world,mpierr)
+           call mpi_bcast(Sexpr4comp(ll).list_var(:Smaxlen),Smaxlen*30,mpi_character,mpiloc(k)-1,mpi_comm_world,mpierr)
+           call mpi_bcast(Sexpr4comp(ll).list_op(:Smaxlen),Smaxlen*10,mpi_character,mpiloc(k)-1,mpi_comm_world,mpierr)
+          end do
+         ! do the comparision. All the cores of mpiloc(k:mpisize) will be compared with
+         ! that on the core mpiloc(k)-1. 
+         if( all( mpirank/=(mpiloc(:k)-1) ) .and. nfpcore_this(mpirank+1)>0 ) then ! this core vs core mpiloc(k)-1
+            do mpij=1,nfpcore_this(mpiloc(k))  ! core mpiloc(k)-1
+              l=0; ll=order(nfpcore_this(mpirank+1)+1)  ! order stores features in descending order 
+              i=l+ceiling(float(ll-l)/2.0)   ! bisection method
+
+              124 continue
+              simpler_result=simpler_se(compcomplexity(mpij),complexity(order(i)),compname(mpij),fname(order(i)))
+
+                 ! check for duplication
+                 if(equivalent_se(compidentity(mpij),fID(order(i)),Sexpr4comp(mpij),mySexpr(order(i)) ) ) then 
+                       if(simpler_result==1) then
+                          available(order(i))=.false.
+                       else
+                          compidentity(mpij)=0.d0
+                       end if
+                    cycle
+                 end if
+                 ! if not equal
+                 if(compidentity(mpij)>fID(order(i)) .or.  (compidentity(mpij)==fID(order(i)) .and. &
+                    simpler_result==1 ) ) then   ! mpij is better than i
+                    ll=i   ! replace the right end with i to find a new middle point
+                 else
+                     l=i  ! replace the left end with i to findn a new middle point
+                 end if
+                 if(i==l+ceiling(float(ll-l)/2.0)) cycle  ! i is already the left or right end; no more
+                 i=l+ceiling(float(ll-l)/2.0)   ! the new middle point
+                 goto 124
+            end do
+            call mpi_send(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpiloc(k)-1,33,&
+                          mpi_comm_world,status,mpierr)
+
+         else if(mpirank==mpiloc(k)-1) then
+             do l=1,mpisize
+                 if( any( l==mpiloc(:k) ) .or. nfpcore_this(l)==0 ) cycle
+            call mpi_recv(compidentity,nfpcore_this(mpiloc(k)),mpi_double_precision,mpi_any_source,33,&
+                      mpi_comm_world,status,mpierr)
+               do i=1,nfpcore_this(mpiloc(k))
+                if(abs(compidentity(i))<1d-8) available(i)=.false.
+               end do
+            end do
+         end if
+
+         deallocate(compidentity)
+         deallocate(compname)
+         deallocate(compcomplexity)
+         deallocate(Sexpr4comp)
+     end if
+
+  end do
+
+  j=0
+  do i=1,nfpcore_this(mpirank+1)
+   if(available(i)) j=j+1
+  end do
+
+  if(mpirank/=0)  then
+    call mpi_send(j,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+  else
+    do k=1,mpisize-1
+      call mpi_recv(i,1,mpi_integer8,k,1,mpi_comm_world,status,mpierr)
+      j=j+i
+    end do
+  end if
+
+END IF
+
+end subroutine
+
+subroutine sure_indep_screening_se(nfpcore_this,available)
+! sure independence screening
+real*8 tmp,pscore(2,mpisize)
+integer*8 i,j,k,l,ll,mpii,mpij,nfpcore_this(:),loc(2),simpler_result,pcomplexity(mpisize)
+character(len=str_len) pname(mpisize)
+logical available(:),pavailable(mpisize)
+
+
+if(mpirank==0) write(*,'(/a)') 'Sure Independence Screening on selected features ...'
+
+pavailable=.false.
+if(nfpcore_this(mpirank+1)>0) call update_availability_se(available,pavailable)
+do l=1,mpisize
+ call mpi_bcast(pavailable(l),1,mpi_logical,l-1,mpi_comm_world,mpierr) 
+end do
+
+! selection starts ...
+i=0   ! count of selected features
+do while( any(pavailable) .and. i<nf_sis(iFCDI) )
+
+      i=i+1
+
+      ! find the max score on each core
+      if(nfpcore_this(mpirank+1)>0) then
+        loc(2:2)=maxloc(sel.feat_score(1,:sel.nselect)) ! score_2 is less important or not used
+        tmp=sel.feat_score(1,loc(2))  ! (loc(1) to be used for other purpose
+      end if
+      do l=1,nfpcore_this(mpirank+1)  ! equal scores
+        if(  abs(tmp-sel.feat_score(1,l))<=1d-8 ) then
+           if( sel.feat_score(2,l)-sel.feat_score(2,loc(2))>1d-8 .or. &
+               (abs(sel.feat_score(2,l)-sel.feat_score(2,loc(2)))<=1d-8 .and. &
+            simpler_se(sel.feat_comp(l),sel.feat_comp(loc(2)),sel.feat_name(l),sel.feat_name(loc(2)))==1 ) )  loc(2)=l
+        end if
+      end do
+      if(nfpcore_this(mpirank+1)>0) then
+        pscore(:,mpirank+1)=sel.feat_score(:,loc(2))  ! location of max score of this core
+        pcomplexity(mpirank+1)=sel.feat_comp(loc(2))  ! corresponding feature complexity
+        pname(mpirank+1)=sel.feat_name(loc(2))  ! corresponding feature name
+      else
+        pscore(:,mpirank+1)=-1   ! simply a negative value to indicate empty
+      end if
+
+      do l=1,mpisize  ! inform other cores about local best features
+        call mpi_bcast(pscore(:,l),2,mpi_double_precision,l-1,mpi_comm_world,mpierr)
+        call mpi_bcast(pcomplexity(l),1,mpi_integer8,l-1,mpi_comm_world,mpierr)
+        call mpi_bcast(pname(l),str_len,mpi_character,l-1,mpi_comm_world,mpierr)
+      end do
+
+      !find the max score between cores
+      loc(1:1)=maxloc(pscore(1,:))
+      tmp=maxval(pscore(1,:))
+      do l=1,mpisize  ! if equal score between features
+        if(  abs(tmp-pscore(1,l))<=1d-8 ) then
+           if( pscore(2,l)-pscore(2,loc(1))>1d-8 .or.  ( abs(pscore(2,l)-pscore(2,loc(1)))<=1d-8 .and. &
+               simpler_se(pcomplexity(l),pcomplexity(loc(1)),pname(l),pname(loc(1)))==1 ) ) loc(1)=l
+        end if
+      end do
+
+      ! save the highest-scored feature
+      if((loc(1)-1)==mpirank) then
+         if(mpirank==0) then
+             sis.Sexpr(i)=sel.Sexpr(loc(2))
+             sis.feat_name(i)=sel.feat_name(loc(2))
+             sis.feat_score(:,i)=sel.feat_score(:,loc(2))
+         else
+        call mpi_send(sel.Sexpr(loc(2)).list_id(:Smaxlen),Smaxlen,mpi_integer,0,96,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.Sexpr(loc(2)).list_len,1,mpi_integer,0,97,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.Sexpr(loc(2)).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,0,98,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.Sexpr(loc(2)).list_var(:Smaxlen),Smaxlen*30,mpi_character,0,99,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.Sexpr(loc(2)).list_op(:Smaxlen),Smaxlen*10,mpi_character,0,100,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.feat_name(loc(2)),str_len,mpi_character,0,101,mpi_comm_world,status,mpierr)
+        call mpi_send(sel.feat_score(:,loc(2)),2,mpi_double_precision,0,103,mpi_comm_world,status,mpierr)
+         end if
+         available(loc(2))=.false.   ! avoid to be selected again
+         sel.feat_score(:,loc(2))=-1  
+
+      end if
+      if(mpirank==0 .and. mpirank/=(loc(1)-1) ) then
+     call mpi_recv(sis.Sexpr(i).list_id(:Smaxlen),Smaxlen,mpi_integer,loc(1)-1,96,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.Sexpr(i).list_len,1,mpi_integer,loc(1)-1,97,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.Sexpr(i).list_pointer(:2,:Smaxlen),Smaxlen*2,mpi_integer,loc(1)-1,98,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.Sexpr(i).list_var(:Smaxlen),Smaxlen*30,mpi_character,loc(1)-1,99,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.Sexpr(i).list_op(:Smaxlen),Smaxlen*10,mpi_character,loc(1)-1,100,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.feat_name(i),str_len,mpi_character,loc(1)-1,101,mpi_comm_world,status,mpierr)
+     call mpi_recv(sis.feat_score(:,i),2,mpi_double_precision,loc(1)-1,103,mpi_comm_world,status,mpierr)
+      end if
+      !---
+
+      if(nfpcore_this(mpirank+1)>0) call update_availability_se(available,pavailable)
+      do l=1,mpisize
+       call mpi_bcast(pavailable(l),1,mpi_logical,l-1,mpi_comm_world,mpierr)
+      end do
+
+end do
+!--------------------
+
+
+nf_sis_avai(iFCDI)=i    ! the actual number of selected features
+
+end subroutine
+
+
+subroutine dup_scheck_se(num,fID,fname,complexity,mySexpr,order,available)
+! duplication check within each core
+! output the arrays "order" and "available"
+integer*8 i,j,l,ll,order(:),n,num,complexity(:),simpler_result
+real*8 fID(:)
+character(len=*) fname(:)
+logical available(:)
+type(Sexpression) mySexpr(:)
+
+IF(num==0) THEN
+  n=0
+ELSE
+
+! creating the descending ordering via bisection method 
+order(1)=1  ! initial first entry
+n=1
+do i=2,num
+
+   l=0; ll=n;   ! left and right end
+   j=l+ceiling(float(ll-l)/2.0)  ! the middle one
+
+   123 continue
+   simpler_result=simpler_se(complexity(i),complexity(order(j)),fname(i),fname(order(j)))
+
+   ! check for duplication
+      if(equivalent_se(fID(i),fID(order(j)),mySexpr(i),mySexpr(order(j))) ) then
+           if( simpler_result==1 ) then
+              available(order(j))=.false.
+              order(j)=i
+           else
+              available(i)=.false.
+           end if
+           cycle
+      end if
+     ! if not equal
+     if(fID(i)>fID(order(j)) .or. (fID(i)==fID(order(j)) .and. simpler_result==1 ) ) then  ! i is better
+        ll=j    ! replace the ll with the j to find a new middle point
+        if(j==l+ceiling(float(ll-l)/2.0)) then   ! if j is already the left end
+          order(j+1:n+1)=order(j:n)  
+          order(j)=i   ! insert i to that before the original j
+          n=n+1
+          cycle
+        end if
+      else
+         l=j   ! j is better; replace the left end with j to find a new middle point
+         if(j==l+ceiling(float(ll-l)/2.0)) then  ! if j is already the right end
+           if(n>j) order(j+2:n+1)=order(j+1:n)
+           order(j+1)=i   ! insert i to that after j
+           n=n+1
+           cycle
+         end if
+      end if
+
+      j=l+ceiling(float(ll-l)/2.0)  ! the new middle point
+      goto 123
+end do
+
+END IF
+
+order(num+1)=n  ! store the number of features after check
+if(mpirank/=0)  then
+  call mpi_send(n,1,mpi_integer8,0,1,mpi_comm_world,status,mpierr)
+else
+  do l=1,mpisize-1
+    call mpi_recv(i,1,mpi_integer8,l,1,mpi_comm_world,status,mpierr)
+    n=n+i
+ end do
+end if
+
+end subroutine
+
+
+function sis_score_se(mySexpr,yyy)   
+! correlation between a feature 'feat' and the target 'yyy'
+! sis_score_se returns a vector with 2 elements
+integer i,j,mm1,mm2,mm3,mm4,k,kk,l,overlap_n,nf1,nf2,itask,nconvexpair
+real*8 feat(npoint),sdfeat(npoint),tmp(ntask),sis_score_se(2),yyy(:),xnorm(ntask),xmean(ntask),&
+       overlap_length,length_tmp,feat_tmp1(npoint),feat_tmp2(npoint),mindist,minlen
+logical isoverlap
+type(Sexpression) mySexpr
+
+feat=evaluator_se(mySexpr)
+
+if(ptype==1) then  
+     do j=1,ntask
+         mm1=sum(nsample(:j-1))+1
+         mm2=sum(nsample(:j))
+         xmean(j)=sum(feat(mm1:mm2))/nsample(j)
+         sdfeat(mm1:mm2)=feat(mm1:mm2)-xmean(j)  
+         xnorm(j)=sqrt(sum((sdfeat(mm1:mm2))**2))
+         if(xnorm(j)>1d-50) then
+            sdfeat(mm1:mm2)=sdfeat(mm1:mm2)/xnorm(j)  ! standardization to the features
+            tmp(j)=abs(sum(sdfeat(mm1:mm2)*yyy(mm1:mm2))) ! |xy|. Donot use normalized yyy here!
+         else
+            tmp(j)=0.d0
+         end if
+     end do
+   ! score ranges from 0 to 1
+   sis_score_se(1)=sqrt(sum(tmp**2)/ntask)  ! quadratic mean of the scores of different tasks
+   sis_score_se(1)=sis_score_se(1)/sqrt(sum(yyy**2)/ntask)  ! normalization
+   sis_score_se(2)=1.d0   ! not used
+
+else if(ptype==2) then  
+   mindist=-1d10
+   overlap_n=0
+   overlap_length=0.d0
+   isoverlap=.false.
+   nconvexpair=0
+   do itask=1,ntask
+
+   ! calculate overlap between domains of property i
+        do i=1,ngroup(itask,1000)-1   ! ngroup(itask,1000) store the number of groups in this task
+            if(itask==1) then
+              mm1=sum(ngroup(itask,:i-1))+1
+              mm2=sum(ngroup(itask,:i))
+            else            
+              mm1=sum(nsample(:itask-1))+sum(ngroup(itask,:i-1))+1
+              mm2=sum(nsample(:itask-1))+sum(ngroup(itask,:i))
+            end if
+            nf1=0
+            do k=mm1,mm2
+               if(yyy(k)<0.5) then  ! y=1: classified, y=0: unclassified
+                   nf1=nf1+1
+                   feat_tmp1(nf1)=feat(k) 
+               end if
+            end do       
+          do j=i+1,ngroup(itask,1000) 
+               if(itask==1) then
+                 mm3=sum(ngroup(itask,:j-1))+1
+                 mm4=sum(ngroup(itask,:j))
+               else
+                 mm3=sum(nsample(:itask-1))+sum(ngroup(itask,:j-1))+1
+                 mm4=sum(nsample(:itask-1))+sum(ngroup(itask,:j))
+               end if
+               nf2=0
+               do k=mm3,mm4
+                   if(yyy(k)<0.5) then
+                       nf2=nf2+1
+                       feat_tmp2(nf2)=feat(k) 
+                   end if
+               end do
+               if(isconvex(itask,i)==0 .and. isconvex(itask,j)==0) cycle ! both are not convex domains
+
+               if(isconvex(itask,i)==1 .and. isconvex(itask,j)==1) then
+                  call convex1d_overlap(feat_tmp1(:nf1),feat_tmp2(:nf2),bwidth,k,length_tmp)
+                  overlap_n=overlap_n+k
+                  nconvexpair=nconvexpair+1
+                  if(length_tmp>=0.d0) isoverlap=.true.
+                   ! which feature is shorter
+                   minlen=min(maxval(feat_tmp1(:nf1))-minval(feat_tmp1(:nf1)),maxval(feat_tmp2(:nf2))&
+                          -minval(feat_tmp2(:nf2)))
+                   if(length_tmp<0.d0) then  ! if separated
+                      if(length_tmp>mindist) mindist=length_tmp  ! renew the worst separation
+                   else if(length_tmp>=0.d0 .and. minlen==0.d0) then  ! if overlapped and one feature is 0D
+                      overlap_length=overlap_length+1.d0  ! totally overlapped
+                   else if(length_tmp>=0.d0 .and. minlen>0.d0) then  ! if not separated and no 0D feature
+                      overlap_length=overlap_length+length_tmp/minlen  ! calculate total overlap
+                   end if
+               else if (isconvex(itask,i)==0 .and. isconvex(itask,j)==1) then  !count the number of i-data in j-domain
+                  overlap_n=overlap_n+convex1d_in(feat(mm3:mm4),feat_tmp1(:nf1),bwidth)
+               else if (isconvex(itask,i)==1 .and. isconvex(itask,j)==0) then !count the number of j-data in i-domain
+                  overlap_n=overlap_n+convex1d_in(feat(mm1:mm2),feat_tmp2(:nf2),bwidth)
+               end if
+
+          end do  ! j
+        end do  ! i
+   end do  ! itask
+
+   sis_score_se(1)=float(overlap_n)  ! >=0, higher sis_score --> worse feature
+   sis_score_se(1)=1.d0/(sis_score_se(1)+1.d0)  ! transform to <=1, higher sis_score --> better feature
+   if(isoverlap) then  ! there are domains overlapped
+     sis_score_se(2)=overlap_length/float(nconvexpair)     ! second metric, <=1, higher --> worse
+     sis_score_se(2)=1.d0/(sis_score_se(2)+1.0)  ! transform to <=1, higher --> better
+   else ! separated length
+     sis_score_se(2)=-mindist   ! separated length between domains,positive
+   end if
+
+end if
+end function
+
+
+subroutine isgoodf_se(mySexpr,feat_name,lastop,feat_comp,feat_unit,nf)
+real*8 feat_unit(:)
+character(len=*) feat_name,lastop
+integer*8 nf,feat_comp
+type(Sexpression) mySexpr
+
+if(goodf_se(mySexpr,feat_name,feat_unit,feat_comp)) then
+  nf=nf+1
+  if(icomb < rung) then
+     if(nf>ubound(gen.Sexpr,1)) call addm_gen_se(int8(ceiling(100000.0/mpisize)),gen)
+     gen.Sexpr(nf)=mySexpr
+     gen.feat_name(nf)=feat_name
+     gen.lastop(nf)=lastop
+     gen.feat_comp(nf)=feat_comp
+     gen.feat_unit(:,nf)=feat_unit
+  end if
+end if
+end subroutine
+
+
+subroutine update_select_se
+! update the selected space
+! bisection method for descending order
+integer*8 i,j,k,l,ll,order(sel.nselect),n,tmp,tmpcomplexity(nbasic_select),simpler_result
+real*8 tmpscore(2,nbasic_select)
+character(len=str_len) tmpname(nbasic_select)
+type(Sexpression) Sexprtmp(nbasic_select)
+
+! creating an ordering from large to small
+order(1)=1   ! assuming the first feature being the best (highest score)
+n=1  ! count of features saved in 'order'
+
+do i=2,sel.nselect   ! compare feaure i with the j located at middle of order(1:n)
+
+  l=0; ll=n;
+  j=l+ceiling(float(ll-l)/2.0)   ! j is at middle between l and ll.
+ 
+  125 continue
+  simpler_result=simpler_se(sel.feat_comp(i),sel.feat_comp(order(j)),sel.feat_name(i),sel.feat_name(order(j)))
+
+  ! get rid of duplicated features
+  if(equivalent_se(sel.feat_score(1,i),sel.feat_score(1,order(j)),sel.Sexpr(i),sel.Sexpr(order(j))) ) then
+        if( simpler_result==1) order(j)=i  ! update the order
+        cycle
+  end if
+
+  ! bisection method for descending order
+  if( (sel.feat_score(1,i)>sel.feat_score(1,order(j))) .or.  ((sel.feat_score(1,i)==sel.feat_score(1,order(j))) &
+       .and. sel.feat_score(2,i)>sel.feat_score(2,order(j))) .or.  ((sel.feat_score(1,i)==sel.feat_score(1,order(j))) &
+       .and. sel.feat_score(2,i)==sel.feat_score(2,order(j)) .and. simpler_result==1) )then 
+      ll=j   ! i is better. Replace the right end ll with j, and find a new middle point
+      if(j==l+ceiling(float(ll-l)/2.0)) then  ! if j is already the left end
+        order(j+1:n+1)=order(j:n)   ! move all the original j:n features by 1 step
+        order(j)=i ! insert the i right before the j, because score_i > score_j
+        n=n+1  ! size increased by 1
+        cycle
+      end if
+   else   ! j is better. Replace the left end l with j, and find a new middle point
+       l=j
+       if(j==l+ceiling(float(ll-l)/2.0)) then
+         if(n>j) order(j+2:n+1)=order(j+1:n)
+         order(j+1)=i
+         n=n+1
+         cycle
+       end if
+   end if
+
+   j=l+ceiling(float(ll-l)/2.0)  ! the new middle point
+   goto 125
+end do
+
+sel.nselect=min(n,nbasic_select)   ! reduce the space by removing features with low ranking
+
+! reordering
+do i=1,sel.nselect
+Sexprtmp(i)=sel.Sexpr(order(i))
+tmpcomplexity(i)=sel.feat_comp(order(i))
+tmpname(i)=sel.feat_name(order(i))
+tmpscore(:,i)=sel.feat_score(:,order(i))
+end do
+
+! update the selected features
+score_threshold=tmpscore(1,sel.nselect)
+sel.Sexpr(:sel.nselect)=Sexprtmp(:sel.nselect)
+sel.feat_comp(:sel.nselect)=tmpcomplexity(:sel.nselect)
+sel.feat_name(:sel.nselect)=tmpname(:sel.nselect)
+sel.feat_score(:,:sel.nselect)=tmpscore(:,:sel.nselect)
+
+end subroutine
+
+
+function equivalent_se(score1,score2,mySexpr1,mySexpr2)
+! check if two features are the same or highly correlated.
+type(Sexpression) mySexpr1,mySexpr2
+real*8 score1,score2,diff,feat1(npoint),feat2(npoint),mean1,mean2,sd1,sd2,ffcorr
+logical equivalent_se
+
+equivalent_se=.false.
+diff=score1-score2
+if(abs(diff)>1d-8) return
+
+feat1=evaluator_se(mySexpr1)
+feat2=evaluator_se(mySexpr2)
+
+mean1=sum(feat1)/float(npoint)
+mean2=sum(feat2)/float(npoint)
+sd1=sqrt((sum((feat1-mean1)**2))/float(npoint))
+sd2=sqrt((sum((feat2-mean2)**2))/float(npoint))
+ffcorr=(sum((feat1-mean1)*(feat2-mean2)))/float(npoint)/sd1/sd2
+
+if(ffcorr>1.d0) ffcorr=1.d0    ! avoid numerical noise
+if(ffcorr<-1.d0) ffcorr=-1.d0
+
+if( abs(ffcorr)> 0.99 ) equivalent_se=.true.
+
+end function
+
+
+subroutine Sexpr_bcomb_se(Sexpr1,Sexpr2,Sexpr12,op)
+type(Sexpression) Sexpr1,Sexpr2,Sexpr12
+integer i,j,k1,k2
+character(len=*) op
+
+     k1=Sexpr1.list_len
+     k2=Sexpr2.list_len
+     Sexpr12=Sexpr1
+     Sexpr12.list_id(k1+1:k1+k2)=Sexpr2.list_id(1:k2)+k1
+     Sexpr12.list_op(k1+1:k1+k2)=Sexpr2.list_op(1:k2)
+     Sexpr12.list_var(k1+1:k1+k2)=Sexpr2.list_var(1:k2)
+     do i=1,k2
+       do j=1,2
+        if(Sexpr2.list_pointer(j,i)/=0) then
+           Sexpr12.list_pointer(j,k1+i)=Sexpr2.list_pointer(j,i)+k1
+        else
+           Sexpr12.list_pointer(j,k1+i)=0
+        end if
+      end do
+     end do
+
+     Sexpr12.list_len=k1+k2+1
+     Sexpr12.list_id(Sexpr12.list_len)=Sexpr12.list_len
+     Sexpr12.list_var(Sexpr12.list_len)=' '   
+     Sexpr12.list_pointer(:,Sexpr12.list_len)=(/k1,k1+k2/)
+     select case(trim(adjustl(op)))
+        case('(+)')
+          Sexpr12.list_op(Sexpr12.list_len)='(+)'
+        case('(-)')
+          Sexpr12.list_op(Sexpr12.list_len)='(-)'
+        case('(*)')
+          Sexpr12.list_op(Sexpr12.list_len)='(*)'
+        case('(/)')
+          Sexpr12.list_op(Sexpr12.list_len)='(/)'
+        case('(|-|)')
+          Sexpr12.list_op(Sexpr12.list_len)='(|-|)'
+      end select
+end subroutine
+
+
+subroutine Sexpr_ucomb_se(Sexpr1,Sexpr2,op)
+type(Sexpression) Sexpr1,Sexpr2
+integer i,j,k1
+character(len=*) op
+
+     Sexpr2=Sexpr1
+     Sexpr2.list_len=Sexpr2.list_len+1
+     Sexpr2.list_id(Sexpr2.list_len)=Sexpr2.list_len
+     Sexpr2.list_var(Sexpr2.list_len)=' '
+     Sexpr2.list_pointer(:,Sexpr2.list_len)=(/Sexpr2.list_len-1,0/)
+     select case(trim(adjustl(op)))
+        case('(exp)')
+          Sexpr2.list_op(Sexpr2.list_len)='(exp)'
+        case('(exp-)')
+          Sexpr2.list_op(Sexpr2.list_len)='(exp-)'
+        case('(^-1)')
+          Sexpr2.list_op(Sexpr2.list_len)='(^-1)'
+        case('(^2)')
+          Sexpr2.list_op(Sexpr2.list_len)='(^2)'
+        case('(^3)')
+          Sexpr2.list_op(Sexpr2.list_len)='(^3)'
+        case('(sqrt)')
+          Sexpr2.list_op(Sexpr2.list_len)='(sqrt)'
+        case('(cbrt)')
+          Sexpr2.list_op(Sexpr2.list_len)='(cbrt)'
+        case('(log)')
+          Sexpr2.list_op(Sexpr2.list_len)='(log)'
+        case('(scd)')
+          Sexpr2.list_op(Sexpr2.list_len)='(scd)'
+        case('(^6)')
+          Sexpr2.list_op(Sexpr2.list_len)='(^6)'
+        case('(sin)')
+          Sexpr2.list_op(Sexpr2.list_len)='(sin)'
+        case('(cos)')
+          Sexpr2.list_op(Sexpr2.list_len)='(cos)'
+     end select
+
+end subroutine
+
+function evaluator_se(mySexpr)
+! evaluator_se
+integer i,j,k,l,length
+type(Sexpression) mySexpr
+real*8 evaluator_se(npoint)
+real*8 val(npoint,mySexpr.list_len)
+
+length=mySexpr.list_len
+val=0.0
+do i=1,length
+  if(index(mySexpr.list_op(i),'var')/=0) then
+     do j=1,nsf
+        if(trim(adjustl(mySexpr.list_var(i)))==trim(adjustl(pfname(j)))) then
+            val(:,i)=pfdata(:,j)
+            exit
+        end if
+     end do
+  else if (index(mySexpr.list_op(i),'(+)')/=0) then
+     val(:,i)=val(:,mySexpr.list_pointer(1,i))+val(:,mySexpr.list_pointer(2,i))
+  else if (index(mySexpr.list_op(i),'(-)')/=0) then
+     val(:,i)=val(:,mySexpr.list_pointer(1,i))-val(:,mySexpr.list_pointer(2,i))
+  else if (index(mySexpr.list_op(i),'(*)')/=0) then
+     val(:,i)=val(:,mySexpr.list_pointer(1,i))*val(:,mySexpr.list_pointer(2,i))
+  else if (index(mySexpr.list_op(i),'(/)')/=0) then
+     val(:,i)=val(:,mySexpr.list_pointer(1,i))/val(:,mySexpr.list_pointer(2,i))
+  else if (index(mySexpr.list_op(i),'(|-|)')/=0) then
+     val(:,i)=abs(val(:,mySexpr.list_pointer(1,i))-val(:,mySexpr.list_pointer(2,i)))
+  else if (index(mySexpr.list_op(i),'(exp)')/=0) then
+     val(:,i)=exp(val(:,mySexpr.list_pointer(1,i)))
+  else if (index(mySexpr.list_op(i),'(exp-)')/=0) then
+     val(:,i)=exp(-val(:,mySexpr.list_pointer(1,i)))
+  else if (index(mySexpr.list_op(i),'(^2)')/=0) then
+     val(:,i)=(val(:,mySexpr.list_pointer(1,i)))**2
+  else if (index(mySexpr.list_op(i),'(^-1)')/=0) then
+     val(:,i)=(val(:,mySexpr.list_pointer(1,i)))**(-1)
+  else if (index(mySexpr.list_op(i),'(^3)')/=0) then
+     val(:,i)=(val(:,mySexpr.list_pointer(1,i)))**3
+  else if (index(mySexpr.list_op(i),'(sqrt)')/=0) then
+     val(:,i)=sqrt(val(:,mySexpr.list_pointer(1,i)))
+  else if (index(mySexpr.list_op(i),'(cbrt)')/=0) then
+     val(:,i)=(val(:,mySexpr.list_pointer(1,i)))**(1.d0/3.d0)
+  else if (index(mySexpr.list_op(i),'(log)')/=0) then
+     val(:,i)=log(val(:,mySexpr.list_pointer(1,i)))
+  else if (index(mySexpr.list_op(i),'(scd)')/=0) then
+     val(:,i)=1.0d0/(PI*(1.0d0+(val(:,mySexpr.list_pointer(1,i)))**2)) 
+  else if (index(mySexpr.list_op(i),'(^6)')/=0) then
+     val(:,i)=(val(:,mySexpr.list_pointer(1,i)))**6
+  else if (index(mySexpr.list_op(i),'(sin)')/=0) then
+     val(:,i)=sin(val(:,mySexpr.list_pointer(1,i)))
+  else if (index(mySexpr.list_op(i),'(cos)')/=0) then
+     val(:,i)=cos(val(:,mySexpr.list_pointer(1,i)))
+  end if
+end do
+evaluator_se(:)=val(:,length)
+
+end function
+
+subroutine printSlist_se(mySexpr)
+type(Sexpression) mySexpr
+integer i,j,k
+k=mySexpr.list_len
+do i=1,k
+write(*,'(a,i3.3,a,a,2i5,2a)') '(',mySexpr.list_id(i),')  ',&
+      trim(mySexpr.list_op(i)),mySexpr.list_pointer(:,i),'  ',trim(mySexpr.list_var(i))
+end do
+end subroutine
+
+
+end module
+
diff --git a/src/SISSO.f90 b/src/SISSO.f90
new file mode 100644
index 0000000..5236333
--- /dev/null
+++ b/src/SISSO.f90
@@ -0,0 +1,578 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+program SISSO
+
+use var_global
+use libsisso
+use FCse
+use FC
+use DI
+use ifport
+!-------------------
+implicit none
+
+integer i,j,k,l,icontinue,iostatus,line_len
+character tcontinue*2,nsample_line*500,isconvex_line*500,funit_line*500,ops_line*500 !,sysdate*8,systime*10
+logical fexist
+
+call mpi_init(mpierr)
+call mpi_comm_size(mpi_comm_world,mpisize,mpierr)
+call mpi_comm_rank(mpi_comm_world,mpirank,mpierr)
+!---
+
+call random_seed()
+call initialization  ! parameters initialization
+call read_para_a     ! read from SISSO.in
+
+
+fileunit=100
+
+if(mpirank==0) then
+ mytime.sFCDI=mpi_wtime()
+
+! inquire(file='SISSO.out',exist=fexist)
+! call date_and_time(date=sysdate,time=systime)
+! if(fexist) iostatus=rename('SISSO.out','SISSO.out'//sysdate//systime(:6))
+ 
+ open(9,file='SISSO.out',status='replace')
+ write(9,'(a)') '****************************************************************'
+ write(9,'(a)') '  Sure Independence Screening and Sparsifying Operator (SISSO)  '
+ write(9,'(a)') '             Version SISSO.3.5, August, 2024.                '
+ write(9,'(a/)')'****************************************************************'
+end if
+
+allocate(nsample(ntask))  ! regression: number of samples for each task
+allocate(ngroup(ntask,1000))  ! classification: maximally 1000 groups inside each task
+allocate(isconvex(ntask,1000)) ! restricted to be convex domain for each data group?
+allocate(feature_units(nunit,nsf)) ! the units of features
+call read_para_b 
+npoint=sum(nsample)
+
+allocate(target_y(npoint))   ! target property
+allocate(pfdata(npoint,nsf))  ! primary scalar features                                                    
+allocate(pfname(nsf)) ! primary-feature name
+allocate(res(npoint))   ! residual errors
+allocate(ypred(npoint))  ! fitted values at each dimension
+
+line_len=1000+nsf*20 ! maximal line length in train.dat
+call read_data  ! from train.dat 
+if(mpirank==0) call output_para 
+
+
+!---------------------
+! FC and DI starts ...
+!---------------------
+
+if(restart==1) then
+  open(fileunit,file='CONTINUE',status='old')
+     read(fileunit,*) icontinue
+     read(fileunit,'(a)') tcontinue
+     if (tcontinue=='FC') then
+        read(fileunit,*) nf_sis_avai(:icontinue-1)
+     else if (tcontinue=='DI') then
+        read(fileunit,*) nf_sis_avai(:icontinue)
+     end if
+  close(fileunit)
+else
+  icontinue=1
+  tcontinue='FC'
+  if(mpirank==0) then
+     iostatus=delfilesqq('Models/data_top1/*')
+     iostatus=delfilesqq('SIS_subspaces/*')
+     iostatus=delfilesqq('Models/*')
+
+     iostatus=deldirqq('Models/data_top1')
+     iostatus=deldirqq('SIS_subspaces')
+     iostatus=deldirqq('Models')
+
+     iostatus=makedirqq('Models')
+     iostatus=makedirqq('Models/data_top1')
+     iostatus=makedirqq('SIS_subspaces')
+!     iostatus=makedirqq('residual')
+  end if
+end if
+
+! iterations
+do iFCDI=icontinue,desc_dim
+   if(mpirank==0) then
+      write(*,'(/a,i3)') 'Dimension: ',iFCDI
+      write(*,'(a)') '-------------------'
+      write(9,'(/a,i3)') 'Dimension: ',iFCDI
+      write(9,'(a)') '-------------------'
+   end if
+
+   if(iFCDI>icontinue .or. (iFCDI==icontinue .and. tcontinue=='FC') ) then
+     ! run FC
+     if(mpirank==0) then
+         write(*,'(a)') 'Feature Construction (FC) starts ...'
+         write(9,'(a)') 'Feature Construction (FC) starts ...'
+     end if
+
+     if(mpirank==0) call writeCONTINUE('FC')
+     if(iFCDI==1) then
+       res=target_y
+     else
+       if (ptype==1) res=target_y-ypred
+       if (ptype==2) res=ypred
+     end if
+     call mpi_barrier(mpi_comm_world,mpierr)
+     if(fstore==1) then
+         call feature_construction   ! Storing features by data (fast, high-memory demand)
+     else if(fstore==2) then        
+         call feature_construction_se   ! Storing features by S-expression (low-momery demand, slower)
+     end if
+   end if
+
+   ! run DI
+   if(mpirank==0) then
+        write(*,'(/a)') 'Descriptor Identification (DI) starts ...'
+        write(9,'(/a)') 'Descriptor Identification (DI) starts ...'
+   end if
+
+   nf_DI=sum(nf_sis_avai(:iFCDI))  ! SIS-selected features for DI
+   if(mpirank==0) &
+       write(9,'(a,i10)') 'Total number of SIS-selected features for this DI: ',sum(nf_sis_avai(:iFCDI))
+   if(trim(adjustl(method_so))=='L0') then ! number of features for L0
+      nf_L0=nf_DI
+   else if(trim(adjustl(method_so))=='L1L0') then  ! number of features for the L0 from L1
+      nf_L0=L1para.nl1l0
+   end if
+
+   if(mpirank==0) call writeCONTINUE('DI')
+   call mpi_barrier(mpi_comm_world,mpierr)
+   call descriptor_identification
+
+   call flush(9)   !Flushes Fortran unit(s)
+   call flush(6)
+end do
+
+if(mpirank==0) then
+ write(*,'(/a/)') 'SISSO done successfully!'
+ open(1,file='CONTINUE',iostat=iostatus,status='old')
+ if(iostatus==0) close(1,status='delete')
+end if
+
+deallocate(nsample)
+deallocate(ngroup)
+deallocate(isconvex)
+deallocate(feature_units)
+deallocate(target_y)
+deallocate(pfdata)
+deallocate(pfname)
+deallocate(res)
+deallocate(ypred)
+
+call mpi_barrier(mpi_comm_world,mpierr)
+
+if(mpirank==0) then
+   mytime.eFCDI=mpi_wtime()
+   write(9,'(a,f15.2)') 'Total time (second): ',mytime.eFCDI-mytime.sFCDI
+   write(9,'(a/)') 'Have a nice day !    '
+   close(9)
+end if
+
+call mpi_finalize(mpierr)
+
+contains
+
+subroutine prepare4FC
+if(iFCDI==1) then
+  res=target_y
+else
+  ! get the residual
+  if (ptype==1) res=target_y-ypred
+  if (ptype==2) res=ypred   
+end if
+
+end subroutine
+
+
+subroutine writeCONTINUE(AA)
+character AA*2
+2005 format(*(i10))
+   open(1234,file='CONTINUE',status='replace')
+   write(1234,'(i2.2)') iFCDI
+   write(1234,'(a)') AA
+   write(1234,2005) nf_sis_avai(:iFCDI)
+   close(1234)
+end subroutine
+
+!read in parameters from SISSO.in
+subroutine read_para_a
+integer i,j,k,l,ioerr
+character line_short*500        
+
+open(fileunit,file='SISSO.in',status='old')
+do while(.true.)
+   read(fileunit,'(a)',iostat=ioerr) line_short
+   if(ioerr<0) exit
+   if(index(line_short,'!')/=0) line_short(index(line_short,'!'):)=''
+   i=index(line_short,'=')
+   if(i>0) then
+   select case (trim(adjustl(line_short(1:i-1))))
+   case('restart')
+   read(line_short(i+1:),*,err=1001) restart
+   case('nsf')
+   read(line_short(i+1:),*,err=1001) nsf
+   case('ntask')
+   read(line_short(i+1:),*,err=1001) ntask
+   case('task_weighting')
+   read(line_short(i+1:),*,err=1001) task_weighting
+   case('scmt')
+   read(line_short(i+1:),*,err=1001) scmt
+   case('nsample') 
+   read(line_short(i+1:),'(a)',err=1001) nsample_line
+   case('isconvex')
+   read(line_short(i+1:),'(a)',err=1001) isconvex_line
+   case('desc_dim')
+   read(line_short(i+1:),*,err=1001) desc_dim
+   case('funit')
+     read(line_short(i+1:),'(a)',err=1001) funit_line
+     if(index(funit_line(k+1:),'(')>0) nunit=0
+     k=0
+     do while (index(funit_line(k+1:),'(')>0)  ! calculate the nunit
+       k=index(funit_line(k+1:),'(')+k
+       nunit=nunit+1
+     end do
+   case('fmax_min')
+   read(line_short(i+1:),*,err=1001) fmax_min
+   case('fmax_max')
+   read(line_short(i+1:),*,err=1001) fmax_max
+   case('fcomplexity')
+   read(line_short(i+1:),*,err=1001) fcomplexity
+   case('ops')
+   read(line_short(i+1:),'(a)',err=1001) ops_line
+   case('nf_sis')
+   if(index(line_short(i+1:),',')/=0) then
+       read(line_short(i+1:),*,err=1001) nf_sis(:desc_dim) ! set the subspace size for each dimension
+   else 
+       read(line_short(i+1:),*,err=1001) nf_sis(1) ! set the same subspace size for all dimensions
+       nf_sis=nf_sis(1)  
+   end if
+   case('ptype')
+   read(line_short(i+1:),*,err=1001) ptype
+   case('fstore')
+   read(line_short(i+1:),*,err=1001) fstore
+   case('bwidth')
+   read(line_short(i+1:),*,err=1001) bwidth
+   case('nmodel')
+   read(line_short(i+1:),*,err=1001) nmodel
+   case('metric')
+   read(line_short(i+1:),*,err=1001) metric
+   case('method_so')
+   read(line_short(i+1:),*,err=1001) method_so
+   case('fit_intercept')
+   read(line_short(i+1:),*,err=1001) fit_intercept
+   case('L1para.max_iter')
+   read(line_short(i+1:),*,err=1001) L1para.max_iter
+   case('L1para.tole')
+   read(line_short(i+1:),*,err=1001) L1para.tole
+   case('L1para.nlambda')
+   read(line_short(i+1:),*,err=1001) L1para.nlambda
+   case('L1para.dens')
+   read(line_short(i+1:),*,err=1001) L1para.dens
+   case('L1para.minrmse')
+   read(line_short(i+1:),*,err=1001) L1para.minrmse
+   case('L1para.warm_start')
+   read(line_short(i+1:),*,err=1001) L1para.warm_start
+   case('L1para.elastic')
+   read(line_short(i+1:),*,err=1001) L1para.elastic
+   case('L1para.weighted')
+   read(line_short(i+1:),*,err=1001) L1para.weighted
+   end select
+   end if
+end do
+close(fileunit)
+
+if(fcomplexity==0) then
+rung=0
+elseif(fcomplexity==1) then
+rung=1
+elseif(fcomplexity>1 .and. fcomplexity <=3) then
+rung=2
+elseif(fcomplexity>3 .and. fcomplexity<=7 ) then
+rung=3
+elseif(fcomplexity>7 .and. fcomplexity<=15) then
+rung=4
+end if
+
+return
+1001 print *, 'Error: Cannot read the parameter: '//trim(line_short); stop
+
+end subroutine
+
+! read in the data for line nsample, funit, ops, and isconvex
+subroutine read_para_b
+integer*8 i,j,k,kk,l,ll
+
+! nsample
+nsample=0
+ngroup=0
+isconvex=1
+
+if(ptype==1) then
+  read(nsample_line,*) nsample
+else 
+  do ll=1,ntask
+    i=index(nsample_line,'(')
+    j=index(nsample_line,')')
+    l=0
+    do k=i,j
+       if(nsample_line(k:k)==',') l=l+1
+    end do
+    if((i+1)>(j-1)) goto 1002
+    read(nsample_line(i+1:j-1),*,err=1002) ngroup(ll,1:l+1)
+    ngroup(ll,1000)=l+1   ! number of groups
+    nsample(ll)=sum(ngroup(ll,1:l+1))
+    nsample_line(:j)=''
+  end do
+
+  if(index(isconvex_line,'(')/=0) then
+     do ll=1,ntask
+       i=index(isconvex_line,'(')
+       j=index(isconvex_line,')')
+       l=0
+       do k=i,j
+          if(isconvex_line(k:k)==',') l=l+1
+       end do
+       if((i+1)>(j-1)) goto 1003
+       read(isconvex_line(i+1:j-1),*,err=1003) isconvex(ll,1:l+1)
+       isconvex(ll,1000)=ngroup(ll,1000) 
+       isconvex_line(:j)=''
+     end do
+  end if
+end if
+
+! ops
+ops=''    
+if(index(ops_line,'(')/=0) then
+   i=index(ops_line,'=')
+   if(i>0) ops_line(:i)=''
+   if(index(ops_line,',')/=0) then
+       read(ops_line,*,err=10031) ops(:rung)  ! operators for each rung
+   else
+       read(ops_line,*,err=10031) ops(1)  ! same operators for all the rungs
+       ops=ops(1)
+   end if
+end if
+
+! funit
+feature_units=0.d0  ! default
+if(index(funit_line,'(')/=0) then 
+  do ll=1,nunit
+    i=index(funit_line,'(')
+    j=index(funit_line,':')
+    kk=index(funit_line,')')
+    if((i+1)>(j-1)) goto 1004
+    if((j+1)>(kk-1)) goto 1004
+    if(i>0 .and. j>0) then
+      read(funit_line(i+1:j-1),*,err=1004) k
+      read(funit_line(j+1:kk-1),*,err=1004) l
+      feature_units(ll,k:l)=1.d0
+      funit_line(:kk)=''
+    end if
+  end do
+end if
+
+inquire(file='feature_units',exist=fexist) ! detect if the file 'feature_units' exists
+if(fexist) then
+  open(1,file='feature_units',status='old')
+  do i=1,nsf
+     read(1,*) feature_units(:,i)   ! unit of each feature represented by a vector (row)
+  end do  
+  close(1)
+end if
+
+return
+1002 print *, 'Error: Cannot read the parameter "nsample"'; stop
+1003 print *, 'Error: Cannot read the parameter "isconvex"'; stop
+10031 print *, 'Error: Cannot read the parameter "ops"'; stop
+1004 print *, 'Error: Cannot read the parameter "funit"'; stop
+
+end subroutine
+
+
+
+!read in the data from train.dat 
+subroutine read_data
+integer*8 i,j,k,l,mm1,mm2
+character(len=str_len) string_tmp(2+nsf),reactionline(100)*10000,samplename(sum(nsample)),line_verylong*line_len
+real*8 SD(ntask)
+
+ if(mpirank==0) write(9,'(a)') 'Read in data from train.dat'
+
+ open(fileunit,file='train.dat',status='old')
+   read(fileunit,'(a)') line_verylong  ! feature names in the first line
+   call sepchange(line_verylong)
+   call string_split(line_verylong,string_tmp,' ')
+
+   if(ptype==1 ) then
+      pfname=string_tmp(3:2+nsf)
+   else
+      pfname=string_tmp(2:1+nsf)
+   end if
+   do i=1,sum(nsample)
+       read(fileunit,'(a)') line_verylong   ! data
+       call sepchange(line_verylong)
+       line_verylong=adjustl(line_verylong)
+       j=index(line_verylong,' ')
+       samplename(i)=line_verylong(:j)
+       line_verylong=line_verylong(j:)
+       if(ptype==1 ) then
+         read(line_verylong,*,err=1005) target_y(i),pfdata(i,:)
+       else  ! ptype==2 
+         read(line_verylong,*,err=1005) pfdata(i,:)   ! no y values in the train.dat file for classification
+         target_y(i)=0.d0    ! 0 denote unclassified
+       end if
+   end do
+ close(fileunit)
+
+ 1999 format(a,i3.3,a,*(f10.5))
+ if(mpirank==0 .and. ptype==1) then
+   do j=1,ntask
+    mm1=sum(nsample(:j-1))+1
+    mm2=sum(nsample(:j))
+    SD(j)=sqrt(sum((target_y(mm1:mm2)-sum(target_y(mm1:mm2))/(mm2-mm1+1))**2)/(mm2-mm1+1))   ! population SD
+    if(ntask==1) then
+       write(9,'(a,f10.5)') 'Standard Deviation (sqrt[sum(y-y_mean)^2/n]) of the target property:',SD(j)
+    else
+       write(9,1999) 'Standard Deviation (sqrt[sum(y-y_mean)^2/n]) of the target property of task ',j,': ',SD(j)
+    end if
+   end do
+ end if
+
+return
+
+1005 print *, 'Error while reading the file "train.dat", at: '//trim(line_verylong); stop
+1007 print *, 'Error while reading the file "reaction.dat", at: '//trim(line_verylong); stop
+
+end subroutine
+
+! converting the separator in train.dat from Tab, comma, to space
+subroutine sepchange(line)
+character(len=*) line
+do while (index(line,char(9))/=0)   ! separator TAB to space
+ line(index(line,char(9)):index(line,char(9)))=' '
+end do
+do while (index(line,',')/=0)    ! separator comma to space
+ line(index(line,','):index(line,','))=' '
+end do
+end subroutine
+
+
+subroutine initialization
+ptype=1               
+ntask=1               
+scmt=.false.          
+desc_dim=2           
+nsample=5             
+nsf= 5                
+fcomplexity=3         
+nunit=1
+nf_sis=50000
+fstore=1
+fmax_min=1e-3         
+fmax_max=1e5          
+restart=0            
+method_so='L0'       
+fit_intercept=.true.  
+metric='RMSE'         
+nmodel=100          
+bwidth=0.001         
+nf_sis_avai=0 
+task_weighting=1      
+!---------------------
+L1para.max_iter=1e6         ! Max iteration for LASSO (given a lambda) to stop
+L1para.tole=1e-6            ! Convergence criteria for LASSO to stop
+L1para.dens=120             ! Density of lambda grid = number of points in [0.001*max,max]
+L1para.nlambda=1e3          ! Max number of lambda points
+L1para.minrmse=1e-3         ! Min RMSE for the LASSO to stop
+L1para.warm_start=.true.    ! Using previous solution for the next step
+L1para.nl1l0= 30            ! Number of features for L0 from L1
+L1para.weighted=.false.       ! Weighted learning for L1 (provide file lasso.weight if yes)
+!----------------------
+
+end subroutine
+
+
+subroutine output_para
+!----------------------
+! output the parameters
+!----------------------
+2001   format(a,*(i6))
+2002   format(*(f8.2))
+2003   format(a,i3.3,a,*(i5))
+2004   format(*(a))
+   write(9,'(a)') 'Read in data from SISSO.in'
+   write(9,'(a,i3)') 'Property type:   ',ptype
+   write(9,'(a,i8)') 'Number of tasks: ',ntask
+   write(9,'(a,i8)') 'Descriptor dimension: ',desc_dim
+   write(9,2001)  'Number of samples for the task(s): ',nsample
+   write(9,'(a,i3)') 'Restarts :',restart
+
+   if(ptype==1) then
+      if(ntask>1) then
+        write(9,'(a,i8)') 'Task_weighting: ',task_weighting
+        write(9,'(a,l6)') 'Sign-constrained multi-task learning: ',scmt
+      end if
+   end if
+
+   if(ptype==2) then
+     do i=1,ntask
+      write(9,2003) 'Number of samples in each category of the task ',i,': ',ngroup(i,:ngroup(i,1000))
+      write(9,2003) 'Convexity of the domains of the task ',i,': ',isconvex(i,:isconvex(i,1000))
+     end do
+     write(9,'(a,f10.6)') 'Domain-boundary tolerance: ',bwidth
+   end if
+
+   write(9,'(a,i8)') 'Scheme for feature storing in memory: ',fstore
+   write(9,'(a,i8)') 'Number of scalar features: ',nsf
+   write(9,'(a,i8)')  'Tier of the feature space (max depth of expression tree): ',rung
+   write(9,'(a,i8)')  'Maximal feature complexity (number of operators in a feature): ',fcomplexity
+   write(9,'(a)') 'Unit of input primary feature, each represented by a row vector: '
+   do i=1,nsf
+     write(9,2002) feature_units(:,i)
+   end do
+   write(9,'(a,e15.5)') 'The feature will be discarded if the minimum of the maximal abs. value in it <',fmax_min
+   write(9,'(a,e15.5)') 'The faature will be discarded if the maximum of the maximal abs. value in it > ',fmax_max
+   write(9,2001) 'Size of the SIS-selected (single) subspace : ',nf_sis(:desc_dim)
+   write(9,2004)  'Operators for feature construction: ',(trim(ops(j)),' ',j=1,rung)
+
+   write(9,'(a,a)') 'Method for sparse regression:  ',method_so
+   if(ptype==1) then
+     write(9,'(a,l6)') 'Fitting intercept: ',fit_intercept
+     write(9,'(a,a)')  'Metric for model selection: ',trim(metric)
+     if(trim(adjustl(method_so))=='L1L0') then
+       write(9,'(a,i10)') 'Max iterations for LASSO (with given lambda) to stop: ',L1para.max_iter
+       write(9,'(a,e20.10)') 'Convergence criterion for LASSO: ',L1para.tole
+       write(9,'(a,i8)') 'Number of lambda trial: ',L1para.nlambda
+       write(9,'(a,i8)') 'Density of lambda points: ',L1para.dens
+       write(9,'(a,e20.10)') 'Minimal RMSE for LASSO to stop: ',L1para.minrmse
+       write(9,'(a,l6)') 'Warm start?  ',L1para.warm_start
+       write(9,'(a,e20.10)') 'Elastic net: ',L1para.elastic
+       write(9,'(a,l6)') 'Weighted LASSO (file lasso.weight required)? ',L1para.weighted
+     end if
+     if(trim(adjustl(method_so))=='L1L0') then
+         write(9,'(a,i8)') 'Number of selected features by L1 for L0 in L1L0:', L1para.nl1l0
+     end if
+   end if
+
+   write(9,'(a,i8)') 'Number of the top-ranked models to output: ',nmodel
+   write(9,'(a)') '--------------------------------------------------------------------------------'
+
+end subroutine
+
+end program
+
+
diff --git a/src/libsisso.f90 b/src/libsisso.f90
new file mode 100644
index 0000000..175a152
--- /dev/null
+++ b/src/libsisso.f90
@@ -0,0 +1,1842 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+module libsisso
+!***********************************************************************
+! General subroutines/functions used by SISSO:
+! ----------
+! det: determinant of a given matrix
+! inverse: inverse of a given matrix
+! orth_de: linear least square fitting by orthogonal decomposition
+! worth_de: observation weighted orth_de
+! lls: linear least fitting by conventional method (not robust)
+! coord_descent: coordinate descent
+! sc_coord_descent: sign-constraint coordinate descent
+! crosspro: crossproduct between two vectors
+! LASSO: least absolute shrinkage and selection operator
+! mtlasso_mpi Multi-task LASSO
+! corr:  pearson's correlation
+! string_split: breaking a string into sub-strings.
+! kfoldCV: k-fold CV for linear model
+! convex2d_xxx: construction of 2D convex hull
+! intriangle: checking if a 3D point is inside a triangle
+! convex3d_xxx: construction of 3D convex hull
+! interlp: intersection between a line and a plane
+! dispp: distance from a point to a plane
+!************************************************************************
+
+use mpi
+
+contains
+
+
+
+
+function dispp(p,a,b,c)
+! Distance of a point from a plane
+! input: a,b,c are the three points determing the plane, p is the point outside the plane.
+! output: the distance
+
+real*8 dispp,a(3),b(3),c(3),p(3),normal(3)
+normal=crosspro((a-b),(c-b))
+dispp=abs(sum(normal*(p-a)))/sqrt(sum(normal**2))
+
+end function
+
+function interlp(p1,p2,p3,p4,p5)
+! intersection between a line and a plane
+! input: p1,p2,p3 for determing the plane, p4 and p5 determing the line
+! output: the interception point
+
+real*8 interlp(3),p1(3),p2(3),p3(3),p4(3),p5(3),normal(3),a,b,c,d,&
+       m,n,p,t
+! for plane
+normal=crosspro((p1-p2),(p3-p2))
+a=normal(1)
+b=normal(2)
+c=normal(3)
+d=-(a*p1(1)+b*p1(2)+c*p1(3))  ! aX+bY+cZ+d=0 (X,Y,Z) is the normal
+! for line
+m=p5(1)-p4(1)
+n=p5(2)-p4(2)
+p=p5(3)-p4(3)  !line equation: (x-x0)/m=(y-y0)/n=(z-z0)/p=t
+! intersection
+t=-(a*p4(1)+b*p4(2)+c*p4(3)+d)/(m*a+n*b+p*c) ! X=mt+x0; Y=nt+y0; Z=pt+z0
+interlp(1)=m*t+p4(1)
+interlp(2)=n*t+p4(2)
+interlp(3)=p*t+p4(3)
+end function
+
+
+function intriangle(p,a,b,c)
+! check if point p is inside the triangle formed by points a,b,c
+! p,a,b,c are in the same plane
+real*8 a(3),b(3),c(3),p(3),v1(3),v2(3),normal_0(3),normal_1(3),normal_2(3),normal_3(3)
+logical intriangle
+intriangle=.false.
+v1=a-b
+v2=c-b
+normal_0=crosspro_abnormalized(v1,v2) ! triangle normal
+v1=p-b
+v2=c-b
+normal_1=crosspro_abnormalized(v1,v2)
+v1=p-c
+v2=a-c
+normal_2=crosspro_abnormalized(v1,v2)
+v1=p-a
+v2=b-a
+normal_3=crosspro_abnormalized(v1,v2)
+if(sum(normal_0*normal_1)>=0.d0 .and. sum(normal_0*normal_2)>=0.d0 .and. sum(normal_0*normal_3)>=0.d0) intriangle=.true.
+end function
+
+
+subroutine string_split(instr,outstr,sp)
+! break a string into sub-strings
+! input: instr, string; sp, separator
+! output: outstr, sub-strings
+character(len=*) instr,outstr(:),sp
+integer n
+logical isend
+
+isend=.false.
+n=0
+outstr=''
+
+i=index(instr,sp)
+if(i/=0) then
+  if(i/=1) then
+    n=n+1
+    outstr(n)=instr(1:i-1)
+  end if
+  do while ((.not. isend) .and. n<ubound(outstr,1))
+    if ((i+len(sp))<len(instr)) then
+        j=index(instr(i+len(sp):),sp)
+        if(j==0) then
+          isend=.true.
+          n=n+1
+          outstr(n)=instr(i+len(sp):)
+        else if(j>1) then
+          n=n+1
+          outstr(n)=instr(i+len(sp):i+len(sp)-1+j-1)
+        end if
+        i=i+len(sp)+j-1
+     else
+       isend=.true.
+     end if
+  end do
+end if
+
+end subroutine
+
+
+function corr(x,y)
+! pearson's correlation
+! input: vector x and y
+! output: the correlation coefficient
+real*8 x(:),y(:),meanx,meany,sigmax,sigmay,m,corr
+m=size(x)
+
+meanx=sum(x)/m
+meany=sum(y)/m
+sigmax=sqrt(sum((x-meanx)**2)/m)
+sigmay=sqrt(sum((y-meany)**2)/m)
+
+corr=sum((x-meanx)*(y-meany))/(m*sigmax*sigmay)
+end function
+
+function det(mat)
+! LU docomposition for sovling determinant of a matrix
+! input: matrix mat
+! output: the determinant.
+
+integer i,j,k,n
+real*8 mat(:,:),um(ubound(mat,1),ubound(mat,1)),s,det,temp(ubound(mat,1))
+s=0
+um=mat
+n=ubound(mat,1)
+
+do i=1,n
+do j=i+1,n
+
+if (um(i,i)==0.d0) then
+  do k=i+1,n
+    if(um(k,i) /=0.d0) then
+      temp=um(i,:)
+      um(i,:)=um(k,:)
+      um(k,:)=-1*temp
+      exit
+    end if
+  end do
+end if
+if (um(i,i)==0.d0) then
+det=0;return
+end if
+
+um(j,:)=um(j,:)-um(j,i)/um(i,i)*um(i,:)
+end do
+end do
+
+do i=1,n
+if(i==1) then
+s=um(i,i)
+else
+s=s*um(i,i)
+endif
+end do
+
+det=s
+end function
+
+function inverse(mat)
+! calculate the inverse of a given matrix
+! input: the matrix mat
+
+real*8  mat(:,:),um(ubound(mat,1),ubound(mat,1)),lm(ubound(mat,1),ubound(mat,1))
+real*8  x(ubound(mat,1),ubound(mat,1)),y(ubound(mat,1),ubound(mat,1)),inverse(ubound(mat,1),ubound(mat,1))
+integer i,j,k,n
+um=mat
+n=ubound(mat,1)
+lm=0;x=0;y=0
+if (abs(det(mat))==0) then
+  print *, 'Error: Value of the determinant is 0, the matrix is singular'; stop
+end if
+if (abs(det(mat))<1d-30) write(*,'(a,E15.6E3,a)') 'Warning: Value of the determinant is:',&
+det(mat),' matrix might be singular'
+
+! construct up triangle matrix
+do i=1,n
+do j=i+1,n
+um(j,:)=um(j,:)-um(j,i)/um(i,i)*um(i,:)
+end do
+end do
+
+! construct low triangle matrix
+lm(1,1)=mat(1,1)/um(1,1)
+do i=2,n
+do j=1,i
+s=sum(lm(i,1:(i-1))*um(1:(i-1),j))
+lm(i,j)=(mat(i,j)-s)/um(j,j)
+end do
+end do
+
+! construct y matrix
+y(1,1)=1/lm(1,1)
+do i=2,n
+do j=1,i
+s=sum(lm(i,1:(i-1))*y(1:(i-1),j))
+if(j/=i) y(i,j)=-s/lm(i,i)
+if(j==i) y(i,j)=(1-s)/lm(i,i)
+end do
+end do
+
+! construct x matrix, which is also the inverse matrix
+do i=1,n
+x(n,i)=y(n,i)/um(n,n)
+end do
+do i=n-1,1,-1
+do j=1,n,1
+s=sum(um(i,i+1:n)*x(i+1:n,j))
+x(i,j)=(y(i,j)-s)/um(i,i)
+end do
+end do
+
+inverse=x
+
+end function
+
+subroutine qr_de(a,q,r)
+! QR decomposition of input matrix a
+! https://en.wikipedia.org/wiki/QR_decomposition
+integer i,j,k,m,n
+real*8 a(:,:),q(ubound(a,1),ubound(a,2)),r(ubound(a,2),ubound(a,2))
+real*8 u(ubound(a,1),ubound(a,2)),e(ubound(a,1),ubound(a,2))
+m=ubound(a,1)
+n=ubound(a,2)
+u=0
+e=0
+
+do i=1,n
+if(i==1) then
+u(:,1)=a(:,1)
+else
+u(:,i)=a(:,i)
+do j=1,i-1
+u(:,i)=u(:,i)-dot_product(u(:,j),a(:,i))/dot_product(u(:,j),u(:,j))*u(:,j)
+end do
+end if
+e(:,i)=u(:,i)/sqrt(dot_product(u(:,i),u(:,i)))
+end do
+q=e
+
+do j=1,n
+do i=1,j
+r(i,j)=sum(e(:,i)*a(:,j))
+end do
+
+do k=j+1,n
+r(k,j)=0.0
+end do
+end do
+end subroutine
+
+
+subroutine orth_de(x,y,intercept,beta,rmse)
+! linear least square fit to y=a+x*b by orthogonal decomposition
+! input: matrix x,vector y; 
+! output intercept,beta,rmse
+! https://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
+
+real*8  x(:,:),y(:),beta(ubound(x,2)),xprime(ubound(x,1),ubound(x,2)),yprime(ubound(y,1)),intercept,rmse
+real*8  q(ubound(x,1),ubound(x,2)),r(ubound(x,2),ubound(x,2)),qty(ubound(x,2)),xmean(ubound(x,2)),ymean
+integer i,j,k,m,n
+
+m=ubound(x,1)
+n=ubound(x,2)
+do i=1,n
+xmean(i)=sum(x(:,i))/m
+xprime(:,i)=x(:,i)-xmean(i)
+end do
+ymean=sum(y)/m
+yprime=y-ymean
+
+call qr_de(xprime,q,r) ! results stored in q and r
+qty=matmul(transpose(q),yprime)
+beta(n)=qty(n)/r(n,n)
+
+do i=n,1,-1
+beta(i)=qty(i)
+do j=i+1,n
+beta(i)=beta(i)-r(i,j)*beta(j)
+end do
+beta(i)=beta(i)/r(i,i)
+end do
+
+rmse=sqrt(sum((yprime-matmul(xprime,beta))**2)/m)
+intercept=ymean-sum(xmean*beta)
+
+end subroutine
+
+subroutine orth_de_nointercept(x,y,beta,rmse)
+! linear least square fit to y=x*b by orthogonal decomposition
+! input: matrix x,vector y; 
+! output beta,rmse
+
+real*8 x(:,:),y(:),beta(ubound(x,2)),rmse
+real*8 q(ubound(x,1),ubound(x,2)),r(ubound(x,2),ubound(x,2)),qty(ubound(x,2))
+integer i,j,k,m,n
+
+m=ubound(x,1)
+n=ubound(x,2)
+
+call qr_de(x,q,r) ! results stored in q and r
+qty=matmul(transpose(q),y)
+beta(n)=qty(n)/r(n,n)
+
+do i=n,1,-1
+beta(i)=qty(i)
+do j=i+1,n
+beta(i)=beta(i)-r(i,j)*beta(j)
+end do
+beta(i)=beta(i)/r(i,i)
+end do
+
+rmse=sqrt(sum((y-matmul(x,beta))**2)/m)
+
+end subroutine
+
+
+
+subroutine worth_de(x,y,weight,intercept,beta,rmse,wrmse)
+! weighted linear least sqaure
+implicit none
+real*8  x(:,:),y(:),beta(ubound(x,2)),xprime(ubound(x,1),ubound(x,2)),yprime(ubound(y,1)),intercept,wrmse,rmse,&
+q(ubound(x,1),ubound(x,2)),r(ubound(x,2),ubound(x,2)),qty(ubound(x,2)),xmean(ubound(x,2)),ymean,weight(:),&
+wx(ubound(x,1),ubound(x,2)),wy(ubound(y,1))
+integer i,j,k,m,n
+
+m=ubound(x,1)
+n=ubound(x,2)
+
+do i=1,n
+xmean(i)=sum(x(:,i))/m
+xprime(:,i)=x(:,i)-xmean(i)
+wx(:,i)=xprime(:,i)*sqrt(weight)
+end do
+ymean=sum(y)/m
+yprime=y-ymean
+wy=yprime*sqrt(weight)
+
+
+call qr_de(wx,q,r) ! results stored in q and r
+qty=matmul(transpose(q),wy)
+beta(n)=qty(n)/r(n,n)
+
+do i=n,1,-1
+beta(i)=qty(i)
+do j=i+1,n
+beta(i)=beta(i)-r(i,j)*beta(j)
+end do
+beta(i)=beta(i)/r(i,i)
+end do
+
+wrmse=sqrt(sum(weight*(yprime-matmul(xprime,beta))**2)/m)
+rmse=sqrt(sum((yprime-matmul(xprime,beta))**2)/m)
+intercept=ymean-sum(xmean*beta)
+
+end subroutine
+
+
+subroutine lls(x,y,intercept,beta,rmse)
+! linear least square fit by the standard method
+! input: matrix x, vector y
+! output: beta,rmse, intercept
+! https://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
+implicit none
+real*8  x(:,:),y(ubound(x,1)),beta0(ubound(x,2)+1),rmse,x0(ubound(x,1),ubound(x,2)+1),&
+        beta(ubound(x,2)),intercept
+integer m,n
+m=ubound(x,1)
+n=ubound(x,2)
+
+x0(:,1)=1
+x0(:,2:n+1)=x
+
+beta0=matmul(inverse(matmul(transpose(x0),x0)),matmul(transpose(x0),y))
+rmse=sqrt(sum((y-matmul(x0,beta0))**2)/m)
+intercept=beta0(1)
+beta=beta0(2:n)
+
+end subroutine
+
+
+function crosspro(a,b)
+! calculate the cross product: a x b
+! input: vector a and b
+! output: a vector
+real*8  a(3),b(3),crosspro(3)
+crosspro(1)=a(2)*b(3)-a(3)*b(2);
+crosspro(2)=a(3)*b(1)-a(1)*b(3);
+crosspro(3)=a(1)*b(2)-a(2)*b(1);
+end function
+
+function crosspro_abnormalized(a,b)
+! a and b are normalized before calculating their cross product
+real*8  a(3),b(3),aa(3),bb(3),crosspro_abnormalized(3),norma,normb
+aa=a
+bb=b
+norma=sqrt(sum(aa**2))
+normb=sqrt(sum(bb**2))
+if(norma/=0.d0 .and. normb/=0.d0) then
+  aa=aa/norma
+  bb=bb/normb
+else
+  crosspro_abnormalized=0.d0
+end if
+
+crosspro_abnormalized(1)=aa(2)*bb(3)-aa(3)*bb(2);
+crosspro_abnormalized(2)=aa(3)*bb(1)-aa(1)*bb(3);
+crosspro_abnormalized(3)=aa(1)*bb(2)-aa(2)*bb(1);
+
+end function
+
+
+subroutine lasso(prod_xty,prod_xtx,lambda,max_iter,tole,beta_init,run_iter,beta,nf)
+! min: f(beta)=1/2*||y-x*beta||**2 + lambda*||beta||_l1
+! prod_xty=XtY; prod_xtx=XtX; max_iter:allowed max cycle; 
+! tole: convergence criteria for cd to stop
+! beta_init: initial coefficient
+! run_iter, actual run cycles; nf: number of selected features; 
+! input prod_xty,prod_xtx,lambda,max_iter,tole,beta_init; 
+! output run_iter,beta, nf
+! (LASSO) J. Friedman, T. Hastie, R. Tibshirani, J. Stat. Softw. 33, 1(2010).
+! (LASSO) J. Friedman, T. Hastie, H. Hofling, R. Tibshirani, Ann. Appl. Stat. 1, 302 (2007).
+implicit none
+real*8 prod_xty(:),prod_xtx(:,:),lambda,beta(:),beta_init(:),&
+beta_old(ubound(beta,1)),z,tole,rand(ubound(beta,1)),beta_tmp,dbeta,xxbeta(ubound(beta,1))
+integer ntotf,i,i1,i2(1),j,k,max_iter,run_iter,nf,ac1(ubound(beta,1)),ac2(ubound(beta,1))
+logical active_change
+
+ntotf=ubound(prod_xtx,1)
+
+beta_old=beta_init
+beta=beta_init
+xxbeta=matmul(prod_xtx,beta)
+
+!-------------------------
+! entering the main loop
+!-------------------------
+
+do i=1,max_iter
+
+   if(i==1) then
+      active_change=.true.
+      ac1=0; ac2=0
+   end if
+
+  do j=1,ntotf
+   if((.not. active_change) .and. abs(beta(j))<1d-10) then
+     cycle
+   else
+       beta_tmp=beta(j)
+       z=prod_xty(j)-xxbeta(j)+beta(j)
+      if(z>0 .and. lambda<abs(z)) then
+       beta(j)=z-lambda
+      else if(z<0 .and. lambda<abs(z)) then
+       beta(j)=z+lambda
+      else if(lambda>=abs(z)) then
+       beta(j)=0
+      end if
+      xxbeta=xxbeta-prod_xtx(:,j)*beta_tmp
+      xxbeta=xxbeta+prod_xtx(:,j)*beta(j)
+   end if
+  end do   
+  active_change=.false. 
+
+  if(i==max_iter) print *, 'Warning: Lasso run hits the max_iter.'
+  if(maxval(abs(beta-beta_old))<tole .or. i==max_iter) then
+    ! check if the active set converge
+     do k=1,ntotf
+        if(abs(beta(k))>1d-10) then
+          ac2(k)=1
+        else
+          ac2(k)=0
+        end if
+     end do
+
+     do k=1,ntotf
+       if(ac1(k)/=ac2(k)) then
+         ac1=ac2
+         active_change=.true.
+         exit
+       end if
+     end do
+     if((.not. active_change)) exit
+  else
+    beta_old=beta
+  end if
+
+end do
+
+nf=sum(ac2)
+run_iter=i
+
+end subroutine
+
+subroutine mtlasso_mpi(prod_xty,prod_xtx,lambda,max_iter,tole,beta_init,run_iter,beta,nf,ncol)
+! prod_xty=XtY; prod_xtx=XtX; max_iter:allowed max cycle; 
+! tole: convergence criteria for cd to stop
+! beta_init: initial coefficient
+! run_iter, actual run cycles; nf: number of selected features; 
+! ncol: mpi jobs assignment
+! output run_iter,beta, nf
+! (LASSO) J. Friedman, T. Hastie, R. Tibshirani, J. Stat. Softw. 33, 1(2010).
+! (LASSO) J. Friedman, T. Hastie, H. Hofling, R. Tibshirani, Ann. Appl. Stat. 1, 302 (2007).
+! (MTLASSO) G. Obozinski, B. Taskar, M. Jordan, 2006 "Multi-task feature selection"
+! multi-task lasso return back to lasso when the number of task is one
+! algorithm for sovling multi-task lasso can be found from that of group lasso
+! (GLASSO) M. Yuan and Y. Lin, J. R. Statist. Soc. B 68,49(2006)
+! (GLASSO) J. Friedman, T. Hastie, and R. Tibshirani, 2010 "A note on the group lasso and a sparse group lasso"
+! (Elastic net) H. Zou, and T. Hastie, J. R. Statist. Soc. B 67, 301 (2005).
+
+real*8 prod_xty(:,:),prod_xtx(:,:,:),lambda,beta(:,:),beta_init(:,:),beta_old(ubound(beta,1),ubound(beta,2)),tole,&
+beta_tmp(ubound(beta,2)),beta_tmp2(ubound(beta,2)),dbeta,xxbeta(ubound(beta,1),ubound(beta,2)),&
+Sj(ubound(beta,2)),norm
+integer ntotf,i,j,k,max_iter,run_iter,nf,ac1(ubound(beta,1)),ac2(ubound(beta,1)),ntask,mpii,mpij,mpik,mpin,ncol(:)
+logical active_change
+!----
+
+ntask=ubound(prod_xty,2)
+ntotf=ubound(prod_xty,1)
+beta_old=beta_init
+beta=beta_init
+
+! calculate xtx*beta
+do mpij=1,ncol(mpirank+1)
+do mpik=1,ntask
+  xxbeta(mpij,mpik)=sum(prod_xtx(:,mpij,mpik)*beta(:,mpik))
+end do
+end do
+
+! rank0 collect data and broadcast
+mpin=ncol(mpirank+1)
+if (mpirank /=0) then
+call mpi_send(xxbeta(1:mpin,:),mpin*ntask,mpi_double_precision,0,1,mpi_comm_world,mpierr)
+else
+do mpii=1,mpisize-1
+mpij=sum(ncol(1:mpii))
+mpik=ncol(mpii+1)
+call mpi_recv(xxbeta(mpij+1:mpij+mpik,:),mpik*ntask,mpi_double_precision,mpii,1,mpi_comm_world,status,mpierr)
+end do
+end if
+
+call mpi_barrier(mpi_comm_world,mpierr)
+call mpi_bcast(xxbeta,ntotf*ntask,mpi_double_precision,0,mpi_comm_world,mpierr)
+
+!------------------------
+! entering the main loop
+!------------------------
+do i=1,max_iter
+
+   if(i==1) then
+      active_change=.true.
+      ac1=0; ac2=0
+   end if
+
+  if(mpirank/=0) then  ! do the job processor after processor,core0->core1->core3...
+   call mpi_recv(xxbeta,ntotf*ntask,mpi_double_precision,mpirank-1,1,mpi_comm_world,status,mpierr)
+  end if
+
+  do j=1+sum(ncol(1:mpirank)),sum(ncol(1:mpirank+1))
+
+    if((.not. active_change) .and. maxval(abs(beta(j,:)))<1d-10) then
+     cycle
+    else
+       ! Multi-task lasso and group lasso share the similar solution
+       ! see paper "A note on the group lasso and a sparse group lasso" by J. Friedman, et al., 2010
+       ! and also note that features are the same for all responses
+       ! if ntask==1, back to lasso automatically.
+       Sj=prod_xty(j,:)-xxbeta(j,:)+prod_xtx(j,j-sum(ncol(1:mpirank)),:)*beta(j,:) 
+       norm=sqrt(sum(Sj**2))
+       beta_tmp=beta(j,:)
+       if(norm<=lambda) then
+         beta(j,:)=0.0
+       else
+         ! only when features are orthogonal
+         !beta(j,:)=Sj*(1-lambda/norm)
+
+         ! do it iteratively when features are not orthogonal
+           beta_tmp2=beta(j,:)+1
+           norm=sqrt(sum((beta(j,:))**2))
+           if(norm<1d-10) norm=1.0
+           do while(maxval(abs(beta(j,:)-beta_tmp2))>1d-10)
+             beta_tmp2=beta(j,:)
+             beta(j,:)=Sj/(prod_xtx(j,j-sum(ncol(1:mpirank)),:)+lambda/norm)
+             norm=sqrt(sum((beta(j,:))**2))
+           end do
+         !--
+       end if
+       do k=1,ntask
+       xxbeta(:,k)=xxbeta(:,k)-prod_xtx(:,j-sum(ncol(1:mpirank)),k)*beta_tmp(k)
+       xxbeta(:,k)=xxbeta(:,k)+prod_xtx(:,j-sum(ncol(1:mpirank)),k)*beta(j,k)
+       end do
+   end if
+
+ end do
+
+
+ if(mpirank/=mpisize-1) then 
+ call mpi_send(xxbeta,ntotf*ntask,mpi_double_precision,mpirank+1,1,mpi_comm_world,mpierr)
+ end if
+! core0->core1, core1->core2, core2->core3, ..., core(n-1)->core(n)
+
+ call mpi_barrier(mpi_comm_world,mpierr)
+ call mpi_bcast(xxbeta,ntotf*ntask,mpi_double_precision,mpisize-1,mpi_comm_world,mpierr)
+
+  mpin=ncol(mpirank+1)
+  if (mpirank /=0) then
+  call mpi_send(beta(1+sum(ncol(1:mpirank)):sum(ncol(1:mpirank+1)),:),ncol(mpirank+1)*ntask,&
+                     mpi_double_precision,0,1,mpi_comm_world,mpierr)
+  else
+  do mpii=1,mpisize-1
+  mpij=sum(ncol(1:mpii))
+  mpik=ncol(mpii+1)
+  call mpi_recv(beta(mpij+1:mpij+mpik,:),mpik*ntask,mpi_double_precision,mpii,1,mpi_comm_world,status,mpierr)
+  end do
+  end if
+  call mpi_bcast(beta,ntotf*ntask,mpi_double_precision,0,mpi_comm_world,mpierr)
+  call mpi_barrier(mpi_comm_world,mpierr)
+
+  active_change=.false.
+
+  if(i==max_iter) print *, 'Warning: Lasso run hits the max_iter.'
+  if(maxval(abs(beta-beta_old))<tole .or. i==max_iter) then
+    ! check if the active set converge
+     do k=1,ntotf
+        if(maxval(abs(beta(k,:)))>1d-10) then
+          ac2(k)=1
+        else
+          ac2(k)=0
+        end if
+     end do
+
+     do k=1,ntotf
+       if(ac1(k)/=ac2(k)) then
+         ac1=ac2
+         active_change=.true.
+         exit
+       end if
+     end do
+     if((.not. active_change)) exit
+  else
+    beta_old=beta
+  end if
+
+end do
+
+nf=sum(ac2)
+run_iter=i
+
+end subroutine
+
+subroutine sc_coord_descent(x,y,max_iter,tole,dd,intercept,beta,rmse)
+! sign-constrained coordinate descent method to solve y=a+x*b
+! input: matrix x,vector y,max iteration:  max_iter,tolerance: tole, dimension: dd
+! output: intercept,beta,rmse
+
+implicit none
+integer m,n,i,j,k,max_iter,dd
+real*8 x(:,:),y(ubound(x,1)),beta(ubound(x,2),2**dd),beta_old(ubound(x,2),2**dd),xprime(ubound(x,1),ubound(x,2)),&
+xdprime(ubound(x,1),ubound(x,2)),yprime(ubound(y,1)),intercept(2**dd),rmse(2**dd),tole,norm_beta,&
+prod_xty(ubound(x,2)),prod_xtx(ubound(x,2),ubound(x,2)),xpnorm(ubound(x,2)),ymean,xmean(ubound(x,2))
+integer csign(ubound(x,2),2**dd)
+
+m=ubound(x,1)
+n=ubound(x,2)
+
+! enumerate all coefficients signs
+csign=1
+do i=n,1,-1
+  if(i==n) then
+    csign(i,2)=-1
+  else
+    csign(i,2**(n-i)+1:2**(n-i+1))=-1
+    csign((i+1):n,2**(n-i)+1:2**(n-i+1))=csign((i+1):n,1:2**(n-i))
+  end if
+end do
+
+! yprime= y-ymean
+! xprime=x-xmean
+! xdprime=xprime/xpnorm
+! intercept=ymean-sum(xmean*beta)
+! argmin||y-x*beta-intercept|| -> argmin||yprime-xprime*beta||
+! beta=beta/xpnorm
+
+do i=1,n
+ xmean(i)=sum(x(:,i))/m
+ xprime(:,i)=x(:,i)-xmean(i)
+ xpnorm(i)=sqrt(sum((xprime(:,i))**2))
+ xdprime(:,i)=xprime(:,i)/xpnorm(i)
+end do
+ymean=sum(y)/m
+yprime=y-ymean
+
+
+beta_old=0.0
+beta=0.0
+
+prod_xty=matmul(transpose(xdprime),yprime)
+prod_xtx=matmul(transpose(xdprime),xdprime)
+
+do k=1,2**dd  ! all possibility of coefficients signs
+  do i=1,max_iter
+   do j=1,n
+         beta(j,k)=prod_xty(j)-sum(prod_xtx(j,:)*beta(:,k))+beta(j,k)
+         if(beta(j,k)*csign(j,k)<0.d0) beta(j,k)=0.d0   ! sign-constrained
+   end do
+     norm_beta=sqrt(sum((beta(:,k))**2))
+    if(norm_beta==0 .or. (sqrt(sum((beta(:,k)-beta_old(:,k))**2))/norm_beta)<tole .or. i==max_iter) then
+      exit
+    else
+      beta_old(:,k)=beta(:,k)
+    end if
+  end do
+  rmse(k)=sqrt(sum((yprime-matmul(xdprime,beta(:,k)))**2)/m)
+  beta(:,k)=beta(:,k)/xpnorm
+  intercept(k)=ymean-sum(xmean*beta(:,k))
+end do
+
+end subroutine
+
+
+subroutine sc_coord_descent_nointercept(x,y,max_iter,tole,dd,beta,rmse)
+! sign-constrained coordinate descent method to solve y=x*b
+! input: matrix x,vector y,max iteration:  max_iter,tolerance: tole, dimension: dd
+! output: beta,rmse
+
+implicit none
+integer m,n,i,j,k,max_iter,dd
+real*8 x(:,:),y(ubound(x,1)),beta(ubound(x,2),2**dd),beta_old(ubound(x,2),2**dd),xprime(ubound(x,1),ubound(x,2)),&
+rmse(2**dd),tole,norm_beta,prod_xty(ubound(x,2)),prod_xtx(ubound(x,2),ubound(x,2)),xnorm(ubound(x,2))
+integer csign(ubound(x,2),2**dd)
+
+m=ubound(x,1)
+n=ubound(x,2)
+
+! enumerate all coefficients signs
+csign=1
+do i=n,1,-1
+  if(i==n) then
+    csign(i,2)=-1
+  else
+    csign(i,2**(n-i)+1:2**(n-i+1))=-1
+    csign((i+1):n,2**(n-i)+1:2**(n-i+1))=csign((i+1):n,1:2**(n-i))
+  end if
+end do
+
+do i=1,n
+ xnorm(i)=sqrt(sum((x(:,i))**2))
+ xprime(:,i)=x(:,i)/xnorm(i)
+end do
+
+
+beta_old=0.0
+beta=0.0
+
+prod_xty=matmul(transpose(xprime),y)
+prod_xtx=matmul(transpose(xprime),xprime)
+
+do k=1,2**dd  ! all possibility of coefficients signs
+  do i=1,max_iter
+   do j=1,n
+         beta(j,k)=prod_xty(j)-sum(prod_xtx(j,:)*beta(:,k))+beta(j,k)
+         if(beta(j,k)*csign(j,k)<0.d0) beta(j,k)=0.d0   ! sign-constrained
+   end do
+     norm_beta=sqrt(sum((beta(:,k))**2))
+    if(norm_beta==0 .or.(sqrt(sum((beta(:,k)-beta_old(:,k))**2))/norm_beta)<tole .or. i==max_iter) then
+      exit
+    else
+      beta_old(:,k)=beta(:,k)
+    end if
+  end do
+  rmse(k)=sqrt(sum((y-matmul(xprime,beta(:,k)))**2)/m)
+  beta(:,k)=beta(:,k)/xnorm
+end do
+
+end subroutine
+
+
+subroutine coord_descent(x,y,max_iter,tole,intercept,beta,rmse) 
+! coordinate descent method to solve y=a+x*b 
+! input: matrix x,vector y, integer max_iter, real tole
+! output: intercept,beta,rmse
+
+implicit none
+real*8 x(:,:),y(ubound(x,1)),beta(ubound(x,2)),beta_old(ubound(x,2)),xprime(ubound(x,1),ubound(x,2)),&
+xdprime(ubound(x,1),ubound(x,2)),yprime(ubound(y,1)),intercept,rmse,tole,norm_beta,prod_xty(ubound(x,2)),&
+prod_xtx(ubound(x,2),ubound(x,2)),xpnorm(ubound(x,2)),ymean,xmean(ubound(x,2))
+integer m,n,i,j,k,max_iter
+
+! yprime= y-ymean
+! xprime=x-xmean
+! xdprime=xprime/xpnorm
+! intercept=ymean-sum(xmean*beta)
+! argmin||y-x*beta-intercept|| -> argmin||yprime-xprime*beta||
+! beta=beta/xpnorm
+
+m=ubound(x,1)
+n=ubound(x,2)
+do i=1,n
+ xmean(i)=sum(x(:,i))/m
+ xprime(:,i)=x(:,i)-xmean(i)
+ xpnorm(i)=sqrt(sum((xprime(:,i))**2))
+ xdprime(:,i)=xprime(:,i)/xpnorm(i)
+end do
+ymean=sum(y)/m
+yprime=y-ymean
+
+beta_old=0.0
+beta=0.0
+
+prod_xty=matmul(transpose(xdprime),yprime)
+prod_xtx=matmul(transpose(xdprime),xdprime)
+
+do i=1,max_iter
+ do j=1,n
+       beta(j)=prod_xty(j)-sum(prod_xtx(j,:)*beta)+beta(j)
+ end do
+   norm_beta=sqrt(sum(beta**2))
+  if(norm_beta==0 .or. (sqrt(sum((beta-beta_old)**2))/norm_beta)<tole .or. i==max_iter) then
+    exit
+  else
+    beta_old=beta
+  end if
+end do
+
+rmse=sqrt(sum((yprime-matmul(xdprime,beta))**2)/m)
+beta=beta/xpnorm
+intercept=ymean-sum(xmean*beta)
+
+end subroutine
+
+
+subroutine kfoldCV(x,y,random,fold,noise,CVrmse,CVmax)
+! k-fold cross validation
+! input: x, y, random, fold, noise
+! output: CVrmse, CVmax,
+
+integer ns,fold,random(:),mm1,mm2,mm3,mm4,i,j,k,kk,l
+real*8 x(:,:),y(:),beta(ubound(x,2)),intercept,rmse,CVse(fold),CVrmse,CVmax,pred(ubound(y,1)),noise(:)
+
+if(fold<2) then
+  print *, 'Error: The fold of CV must be >=2 !'; stop
+end if
+
+ns=ubound(y,1)
+
+CVmax=0
+CVrmse=0
+k=int(ns/fold)
+kk=mod(ns,fold)
+do l=1,fold
+   mm1=1  ! sample start
+   mm2=ns      ! sample end
+   mm3=(l-1)*k+min((l-1),kk)+1 ! test start
+   mm4=mm3+k-1+int(min(l,kk)/l)  ! test end
+   if(mm1==mm3) then
+      call orth_de(x([random(mm4+1:mm2)],:),y([random(mm4+1:mm2)])+noise([random(mm4+1:mm2)]),intercept,beta(:),rmse)
+   else if(mm4==mm2) then
+      call orth_de(x([random(mm1:mm3-1)],:),y([random(mm1:mm3-1)])+noise([random(mm1:mm3-1)]),intercept,beta(:),rmse)
+   else
+      call orth_de(x([random(mm1:mm3-1),random(mm4+1:mm2)],:),&
+                   y([random(mm1:mm3-1),random(mm4+1:mm2)])+noise([random(mm1:mm3-1),random(mm4+1:mm2)]),&
+                   intercept,beta(:),rmse)
+   end if
+   pred(mm3:mm4)=(intercept+matmul(x([random(mm3:mm4)],:),beta(:)))
+end do
+CVmax=maxval(abs(y([random])-pred))
+CVrmse=sqrt(sum((y([random])-pred)**2)/ns)  ! quadratic mean over samples
+end subroutine
+
+
+subroutine convex2D_hull(set,numb,hull)
+! calculate the convex hull for a given data set
+! input: set, a matrix N x 2
+! output: numb (number of vertices); hull, the vertices stored in clockwise direction
+real*8 set(:,:),hull(:,:),tmp,vjj(2),vij(2),vkj(2),normij,normkj
+integer numb,i,j,k,ntot,loc(1),nrecord
+logical used(ubound(set,1)),isvertex
+
+ntot=ubound(set,1)
+used=.false.
+numb=0
+
+! find the initial point with min(x) and min(y)
+! j can't be set 'used' as it is waiting to be found to close the hull
+loc=minloc(set(:,1)) 
+j=loc(1)
+do i=1,ntot  
+  if(i==j) cycle
+  if(abs(set(loc(1),1)-set(i,1))<=1d-10 .and. set(loc(1),2)>set(i,2)) loc(1)=i  
+end do
+j=loc(1)
+numb=1
+hull(1,:)=set(j,:)
+
+! find points at the same position with j
+do i=1,ntot
+   if(i==j) cycle
+   if(abs(set(j,1)-set(i,1))<=1d-10 .and. abs(set(j,2)-set(i,2))<=1d-10) then
+   numb=numb+1
+   hull(numb,:)=set(i,:)
+   used(i)=.true.
+  end if
+end do
+if(numb==ntot) return
+
+! start searching vertices
+! from known vector vjj, find the next vertex k by searching all i
+
+vjj=(/0.d0,-1.d0/)  ! initial vector pointing downward
+nrecord=0
+
+do while(any(used .eqv. .false.) )
+  nrecord=nrecord+1
+  if(nrecord>ntot) exit ! in case of endless loop
+  isvertex=.false.
+  vkj=vjj
+  normkj=-1.d0  ! normkj<0 if vkj==vjj; normkj>0 if vkj\=vjj
+
+  ! find the next vertex k to update j, vjj, numb, hull, used
+  do i=1,ntot
+
+    if(used(i) .or. i==j) cycle   ! loc(1) will be the last point to find
+
+    if(abs(set(j,1)-set(i,1))<=1d-10 .and. abs(set(j,2)-set(i,2))<=1d-10  ) then  ! all the points at the same position 
+       numb=numb+1
+       hull(numb,:)=set(i,:)
+       used(i)=.true.
+       cycle
+    end if
+
+    vij=set(i,:)-set(j,:)     
+    normij=sqrt(sum(vij**2))
+    vij=vij/normij
+    if(i==loc(1)) normij=1d10  ! ensure loc(1) is the last vertex 
+
+    tmp=vkj(1)*vij(2)-vij(1)*vkj(2)  ! cross product between vector vij and vkj
+
+    if (tmp>1d-10   .or. &  ! on the left side of vkj(vkj==vjj) or vij
+       (normkj>0.d0 .and. tmp>0.d0 .and. tmp<=1d-10 .and. sum(vij*vkj)>0.d0 ) .or. &  ! vkj/=vjj
+       (normkj<0.d0 .and. tmp>=0.d0 .and. tmp<=1d-10 .and. sum(vij*vjj)<0.d0 ) .or. &  ! vkj==vjj
+       (normkj>0.d0 .and. tmp==0.d0  .and. sum(vij*vkj)>0.d0 .and. normij<normkj ) ) then 
+          k=i
+          vkj=vij
+          normkj=normij  ! positive
+          isvertex=.true.
+    end if
+  end do
+
+  if(isvertex) then  ! assign the new vertex k to j
+    if(k==loc(1)) exit  ! loc(1) is already in the hull(1,:)
+     numb=numb+1
+     hull(numb,:)=set(k,:)
+     used(k)=.true.
+     vjj=-vkj   ! reverse the direction of vkj and assign to vjj
+     j=k   ! new j
+  end if
+
+end do
+
+end subroutine
+
+
+subroutine convex3D_hull(set,ntri,triangles)
+! find the convex hull by finding all the triangle facets
+! input: set, a matrix N x 3
+! output:  triangles, the ids of vertices forming traingles; ntri, number of triangles
+! NOTE: triangles is an allocatable array
+real*8  set(:,:),tmp,vij(2),vkj(2),normal_a(3),normal_b(3),vtmp(3,3),vedge(3),aa,bb,cc
+integer ntri,i,j,k,l,m,ntot,loc(2),nedge,iedge,leftright,comp(3),size_tri,size_edge,ntri_coplane,ntri_saved,nvert
+integer,allocatable:: triangles(:,:),edges(:,:),change(:,:),recorder(:),coplane(:,:),ver_index(:)
+logical overlap(ubound(set,1)),used,planar,goodedge
+
+ntot=ubound(set,1)
+overlap=.false.  ! points at the same position
+ntri=0
+nedge=0
+
+size_tri=10000
+size_edge=10000
+allocate(triangles(size_tri,3))  ! size to be adjusted
+allocate(edges(size_edge,2))
+
+! find the initial edge (the most left one projected on the x-y plane)
+
+!-------------------------------------
+! first point: min(x),min(y),min(z)
+loc(1:1)=minloc(set(:,1)) 
+k=loc(1)
+do i=1,ntot  
+  if(i==k) cycle
+  if(abs(set(loc(1),1)-set(i,1))<=1d-10 .and. set(loc(1),2)>set(i,2)) loc(1)=i  
+end do
+k=loc(1)
+do i=1,ntot
+ if(i==k) cycle
+ if(abs(set(loc(1),1)-set(i,1))<=1d-10 .and. abs(set(loc(1),2)-set(i,2))<=1d-10 .and. set(loc(1),3)>set(i,3)) loc(1)=i
+end do
+
+! second point for the initial edge
+vkj=(/0.d0,-1.d0/)
+vtmp(3,:)=(/0.d0,0.d0,-1.d0/)
+loc(2)=0
+do i=1,ntot   ! find a i so that vector i<-j is to the left side of k<-j
+  if(i==loc(1)) cycle 
+  vij=set(i,:2)-set(loc(1),:2)     
+  tmp=vkj(1)*vij(2)-vij(1)*vkj(2)  ! cross product vkj x vij
+
+  if (tmp>1d-10 ) then  ! on the left side of vkj or vij
+     loc(2)=i
+     vkj=vij
+     vtmp(3,:)=set(i,:3)-set(loc(1),:3) ! 3D form of vkj
+  else if(abs(tmp)<=1d-10) then  ! considering 3D
+    vtmp(1,:)=(/0.d0,0.d0,1.d0/)
+    vtmp(2,:)=set(i,:3)-set(loc(1),:3)
+    aa=sum(crosspro_abnormalized(vtmp(2,:),vtmp(1,:))*crosspro_abnormalized(vtmp(2,:),vtmp(3,:)))
+    bb=sum(crosspro_abnormalized(vtmp(2,:),vtmp(1,:))**2)  ! between vi and z
+    cc=sum(crosspro_abnormalized(vtmp(2,:),vtmp(3,:))**2)  ! between vi and vk
+    if(aa<0 .or. (bb==0.d0 .and. cc/=0.d0) .or. (cc==0.d0 .and. sum(vtmp(2,:)**2)>sum(vtmp(3,:)**2)) ) then
+        loc(2)=i
+        vkj=vij
+        vtmp(3,:)=set(i,:3)-set(loc(1),:3) ! 3D form of vkj
+    end if
+  end if
+end do
+if(loc(2)==0) return  ! all points are at the same one point
+!-------------------------------------------------------
+
+nedge=1
+ntri=0
+if(loc(1)<loc(2)) then ! let edges(i,1) always < edges(i,2)
+  edges(1,:)=(/loc(1),loc(2)/)
+else
+  edges(1,:)=(/loc(2),loc(1)/)
+end if
+
+! find all points at the same position with the initial edge points
+do i=1,ntot
+   if(i==edges(1,1) .or. i==edges(1,2)) cycle
+   if( sqrt(sum((set(edges(1,1),:)-set(i,:))**2))<1d-10 .or. sqrt(sum((set(edges(1,2),:)-set(i,:))**2))<1d-10) &
+      overlap(i)=.true.
+end do
+
+! With the initial edge and plane, find all the triangles
+leftright=1
+iedge=1
+do while(iedge<=nedge)
+
+  ! set a initial plane
+   vedge=set(edges(iedge,2),:)-set(edges(iedge,1),:)
+   goodedge=.true.
+
+   do i=1,ntot
+     if(i==edges(iedge,1) .or. i==edges(iedge,2) .or. overlap(i) ) cycle
+     vtmp(1,:)=set(i,:)-set(edges(iedge,1),:)
+     vtmp(2,:)=crosspro_abnormalized(vedge,vtmp(1,:)) ! normal
+     if(sqrt(sum(vtmp(2,:)**2))>1d-10) then
+       normal_a=vtmp(2,:)      ! the normal vector of the initial plane
+       k=i
+       exit
+     end if
+     if(i==ntot) return  ! all points are on a line
+   end do
+
+ ! check if all the points are in one plane
+   if(iedge==1) then
+     planar=.true.
+     do i=1,ntot
+         if( sum((set(i,:)-set(edges(iedge,1),:))*normal_a) /=0.d0) then
+           planar=.false.
+           exit
+         end if 
+     end do
+     if(planar) return
+   end if
+
+ !-------------------------------------
+ ! For each edge, find the leftmost and rightmost facet!
+  do i=1,ntot
+    if(overlap(i) .or. k==i .or. edges(iedge,1)==i .or. edges(iedge,2)==i) cycle 
+    vtmp(1,:)=set(i,:)-set(edges(iedge,1),:) 
+    vtmp(2,:)=crosspro_abnormalized(vedge,vtmp(1,:)) ! normal of the considered plane
+    normal_b=vtmp(2,:)
+    if(sqrt(sum(normal_b**2))<1d-10) cycle  ! point i is on the edge-line
+    vtmp(3,:)=crosspro_abnormalized(normal_a,normal_b) ! cross product between the normal of two plane
+    tmp=sum(vtmp(3,:)*vedge) ! to the left or right
+
+    if(abs(tmp)>1d-10) then  !/=0
+       if((leftright>0 .and. tmp>0.d0) .or. (leftright<0 .and. tmp<0.d0)) then
+          k=i
+          normal_a=normal_b
+       end if
+    else if(.not. intriangle(set(i,:),set(edges(iedge,1),:),set(edges(iedge,2),:),set(k,:)) ) then
+       if(leftright>0) then
+         vtmp(1,:)=set(i,:)-set(edges(iedge,1),:)
+         vtmp(2,:)=set(k,:)-set(edges(iedge,1),:)
+         aa=sum(crosspro_abnormalized(vedge,vtmp(2,:))*crosspro_abnormalized(vtmp(2,:),vtmp(1,:))) !edge x vk,vk x vi
+         bb=sum(crosspro_abnormalized(vedge,vtmp(2,:))*crosspro_abnormalized(vedge,vtmp(1,:))) 
+       else ! reverse the edge direction
+         vtmp(1,:)=set(i,:)-set(edges(iedge,2),:)
+         vtmp(2,:)=set(k,:)-set(edges(iedge,2),:)
+         aa=sum(crosspro_abnormalized(-vedge,vtmp(2,:))*crosspro_abnormalized(vtmp(2,:),vtmp(1,:)))!edge x vk,vk x vi
+         bb=sum(crosspro_abnormalized(-vedge,vtmp(2,:))*crosspro_abnormalized(-vedge,vtmp(1,:)))
+       end if
+
+       if( bb <-1d-10) then   ! the edge is not really edge
+          goodedge=.false.
+          exit
+       end if
+  
+       if( aa >=0.d0) then ! the edge repel the 'i' to the other edge
+          k=i
+          normal_a=normal_b
+       end if
+
+    end if
+  end do
+
+ !---------------------------------------
+
+  if(goodedge) then
+      ! add new triangle
+       comp(1)=min(edges(iedge,1),edges(iedge,2),k)
+       comp(3)=max(edges(iedge,1),edges(iedge,2),k)
+       if(k/=comp(1) .and. k/=(comp(3))) then
+           comp(2)=k
+       elseif(edges(iedge,1)/=comp(1) .and. edges(iedge,1)/=comp(3)) then
+           comp(2)=edges(iedge,1)
+       else
+           comp(2)=edges(iedge,2)
+       end if  
+
+       used=.false.
+       do i=1,ntri
+         if(all(triangles(i,:) == comp(:))) then
+            used=.true.
+            exit
+         end if
+       end do
+
+       if(.not. used ) then
+          ntri=ntri+1
+          triangles(ntri,:3)=comp(:3)
+       end if
+       if(ntri==size_tri) then
+         allocate(change(size_tri,3))
+         change=triangles
+         deallocate(triangles)
+         size_tri=size_tri*2
+         allocate(triangles(size_tri,3))
+         triangles(:ntri,:)=change
+         deallocate(change)
+       end if
+
+       ! find overlap points
+       do i=1,ntot
+          if(i==k .or. overlap(i) ) cycle
+          if( (abs(set(k,1)-set(i,1))<=1d-10 .and. abs(set(k,2)-set(i,2))<=1d-10 .and. &
+              abs(set(k,3)-set(i,3))<=1d-10 )) overlap(i)=.true.
+       end do
+
+       do j=1,2
+           comp(1)=min(edges(iedge,j),k)
+           comp(2)=max(edges(iedge,j),k)
+           used=.false.
+           do i=1,nedge
+              if(all(edges(i,:)==comp(:2))) then
+                  used=.true.
+                  exit
+              end if
+           end do
+           if(.not. used) then
+              nedge=nedge+1
+              edges(nedge,:)=comp(:2)
+           end if
+
+          if(nedge==size_edge) then
+            allocate(change(size_edge,2))
+            change=edges
+            deallocate(edges)
+            size_edge=size_edge*2
+            allocate(edges(size_edge,2))
+            edges(:nedge,:)=change
+            deallocate(change)
+          end if
+
+       end do
+   end if
+
+   leftright=-leftright   ! each edge is to be used twice.
+   if(leftright>0) iedge=iedge+1
+
+end do
+
+
+! rearrangement of the triangles in the same plane
+allocate(change(5*ntri,3))
+allocate(coplane(5*ntri,3))
+allocate(recorder(ntri))
+allocate(ver_index(ntri*3))
+recorder=1  ! 1: unaccessed; 0: accessed
+ntri_saved=0  ! number of rearranged triangles
+ntri_coplane=0  ! number of triangles in one plane
+
+do i=1,ntri
+  if(recorder(i)==0) cycle
+  recorder(i)=0
+  ntri_coplane=1
+  coplane(ntri_coplane,:)=triangles(i,:)
+  vtmp(1,:)=set(triangles(i,2),:)-set(triangles(i,1),:)
+  vtmp(2,:)=set(triangles(i,3),:)-set(triangles(i,1),:)
+  normal_a=crosspro_abnormalized(vtmp(1,:),vtmp(2,:)) ! the reference normal
+  do j=i+1,ntri  ! find the coplane triangles if there are
+     if(recorder(j)==0) cycle
+     vtmp(1,:)=set(triangles(j,2),:)-set(triangles(j,1),:)
+     vtmp(2,:)=set(triangles(j,3),:)-set(triangles(j,1),:)
+     normal_b=crosspro_abnormalized(vtmp(1,:),vtmp(2,:)) !the normal of candidate triangle
+     if(sqrt(sum(crosspro_abnormalized(normal_a,normal_b)**2))<=1d-10 .and. &
+        dispp(set(triangles(j,1),:),set(triangles(i,1),:),set(triangles(i,2),:),set(triangles(i,3),:))<1d-6) then 
+        ntri_coplane=ntri_coplane+1
+        coplane(ntri_coplane,:)=triangles(j,:)
+        recorder(j)=0
+     end if
+  end do
+ 
+  ver_index=0
+  ver_index(1:3)=coplane(1,:) 
+  nvert=3 
+  do j=2,ntri_coplane  ! find the vertice of all the coplane triangles
+    if(all(coplane(j,1)/=ver_index(:nvert))) then
+       nvert=nvert+1; ver_index(nvert)=coplane(j,1)
+    end if
+    if(all(coplane(j,2)/=ver_index(:nvert))) then
+       nvert=nvert+1; ver_index(nvert)=coplane(j,2)
+    end if
+    if(all(coplane(j,3)/=ver_index(:nvert))) then
+       nvert=nvert+1; ver_index(nvert)=coplane(j,3)
+    end if
+  end do
+
+  if(nvert>3) then
+  ! direction
+    vtmp(1,:)=set(ver_index(2),:)-set(ver_index(1),:)
+    vtmp(2,:)=set(ver_index(3),:)-set(ver_index(1),:)
+    normal_a=crosspro_abnormalized(vtmp(1,:),vtmp(2,:))  
+
+  ! find the starting point
+    k=1
+    do j=3,nvert
+       vtmp(1,:)=set(ver_index(2),:)-set(ver_index(k),:)
+       vtmp(2,:)=set(ver_index(j),:)-set(ver_index(k),:)
+       vtmp(3,:)=crosspro_abnormalized(vtmp(1,:),vtmp(2,:))
+       if(sum(vtmp(3,:)**2)==0.d0 .and. sum(vtmp(1,:)*vtmp(2,:))<0.d0) k=j
+    end do
+    ! switch points 1 and k
+    j=ver_index(1)
+    ver_index(1)=k
+    ver_index(k)=j
+
+   ! find the second point
+    k=2
+    do j=3,nvert  !find the first most left edge: 1-k
+       vtmp(1,:)=set(ver_index(k),:)-set(ver_index(1),:)
+       vtmp(2,:)=set(ver_index(j),:)-set(ver_index(1),:)
+       normal_b=crosspro_abnormalized(vtmp(1,:),vtmp(2,:))
+       aa=sum(normal_b*normal_a)
+       if( aa>0.d0 .or. (aa==0.d0 .and.  sum(vtmp(2,:)**2)>sum(vtmp(1,:)**2) ) ) k=j
+    end do
+    
+    ! find all the triangles
+    m=2  ! point 1 and k
+    vtmp(1,:)=set(ver_index(1),:)-set(ver_index(k),:) 
+    do while(m<nvert)
+       l=0
+       do j=2,nvert  ! find triangle 1-l-k
+         if(j==k .or. ver_index(j)==0) cycle ! skip the evaluated points
+         vtmp(2,:)=set(ver_index(j),:)-set(ver_index(k),:)
+         normal_b=crosspro_abnormalized(vtmp(1,:),vtmp(2,:))
+         aa=sum(normal_b*normal_a)
+         if( aa> 0.d0) then
+           l=j
+           vtmp(1,:)=set(ver_index(l),:)-set(ver_index(k),:)
+         elseif(aa==0.d0) then   ! linear
+           if(sum(vtmp(2,:)**2)>sum(vtmp(1,:)**2)) then
+             l=j
+             vtmp(1,:)=set(ver_index(l),:)-set(ver_index(k),:)
+           else
+             ver_index(j)=0    ! remove points inside a line segment
+           end if
+         end if
+       end do
+
+       if(l/=0) then
+         vtmp(2,:)=crosspro_abnormalized(vtmp(1,:),set(ver_index(l),:)-set(ver_index(1),:))
+         if(sqrt(sum(vtmp(2,:)**2))>1d-10) then  ! the three points are not on the same line
+            ntri_saved=ntri_saved+1 ! every loop there must be a new point found
+            change(ntri_saved,:)=(/ver_index(1),ver_index(k),ver_index(l)/)
+         end if
+          ver_index(k)=0
+          vtmp(1,:)=-vtmp(1,:)  ! reverse the direction for the next loop
+          k=l
+       end if
+
+       m=m+1  ! every loop find one vertex,totally nvert-2 points
+    end do
+
+  else
+     ntri_saved=ntri_saved+1
+     change(ntri_saved,:)=ver_index(1:3)
+  end if    
+
+end do
+
+ntri=ntri_saved
+triangles(:ntri_saved,:)=change(:ntri_saved,:)
+
+deallocate(change)
+deallocate(recorder)
+deallocate(coplane)
+deallocate(ver_index)
+deallocate(edges)
+
+end subroutine
+
+
+subroutine convex2d_overlap(set1,set2,bwidth,numb,area)
+! counting the number of data and area in the overlapped region between two convex hulls
+! input: data set 1, data set 2,bwidth(boundary tolerance)
+! output: number of data in the overlapped region, area of the overlapped region
+! segment intersection: (http://dec3.jlu.edu.cn/webcourse/t000096/graphics/chapter5/01_1.html)
+! if two convex domain has more than two intersection points, the two domains are considered totally overlap !
+
+integer i,j,i2,j2,k,numb,nh1,nh2,ns1,ns2,n_intersect
+real*8 set1(:,:),set2(:,:),area,hull1(ubound(set1,1),2),hull2(ubound(set2,1),2),bwidth,norm1,norm2,vunit1(2),vunit2(2),&
+set3(10*(ubound(set1,1)+ubound(set2,1)),2),tmp,segp(4,2),delta,lambda,mu,xa,xb,ya,yb,xc,xd,yc,yd
+logical inside,polygon1,polygon2
+
+call convex2D_hull(set1,nh1,hull1)  ! convex hull of data set 1 
+call convex2D_hull(set2,nh2,hull2)  ! convex hull of data set 2
+ns1=ubound(set1,1)
+ns2=ubound(set2,1)
+numb=0
+area=0.d0
+polygon1=.true.
+polygon2=.true.
+
+if(convex2d_area(hull1(:nh1,:))<1d-10) polygon1=.false.
+if(convex2d_area(hull2(:nh2,:))<1d-10) polygon2=.false.
+
+! both the sets are not polygons
+if((ns1>2 .and. (.not. polygon1)) .or. (ns2>2 .and. (.not. polygon2)) ) then   ! two lines
+  numb=ns1+ns2      ! set to max since this is not wanted.
+  area=0.d0
+else
+  ! set1 is polygon, count the number of set2 data in hull1
+  if(polygon1) then
+  do i=1,ns2
+    ! check if data from set2 is inside hull1
+    inside=.true.
+    do j=1,nh1
+       k=j+1
+       if(j==nh1) k=1
+
+       norm1=sqrt(sum((hull1(k,:)-hull1(j,:))**2))
+       if(norm1<1d-10) cycle
+       vunit1=(hull1(k,:)-hull1(j,:))/norm1
+       norm2=sqrt(sum((set2(i,:)-hull1(j,:))**2))
+       if(norm2<1d-10) cycle
+       vunit2=(set2(i,:)-hull1(j,:))/norm2
+       tmp=vunit1(1)*vunit2(2)-vunit2(1)*vunit1(2)  ! cross product
+
+       if(tmp>0.d0) then  
+          if(bwidth>0.d0) then
+            if(convex2d_dist(set2(i:i,:2),hull1(:nh1,:2))>bwidth) then  ! boundary tolerance
+             inside=.false.
+             exit  
+            end if
+          else
+             inside=.false.
+             exit
+          end if
+      end if
+    end do
+    if(inside) then
+       numb=numb+1
+       set3(numb,:)=set2(i,:)
+    end if
+  end do
+  end if
+
+  ! set1 data in hull2
+  if(polygon2) then
+  do i=1,ns1
+    ! check if set1 data is inside hull2
+    inside=.true.
+    do j=1,nh2
+       k=j+1
+       if(j==nh2) k=1
+
+       norm1=sqrt(sum((hull2(k,:)-hull2(j,:))**2))
+       if(norm1<1d-10) cycle
+       vunit1=(hull2(k,:)-hull2(j,:))/norm1
+       norm2=sqrt(sum((set1(i,:)-hull2(j,:))**2))
+       if(norm2<1d-10) cycle
+       vunit2=(set1(i,:)-hull2(j,:))/norm2
+       tmp=vunit1(1)*vunit2(2)-vunit2(1)*vunit1(2)  ! cross product
+
+
+       if(tmp>0.d0) then  ! point on the left side of the edge 
+          if(bwidth>0.d0) then
+             if(convex2d_dist(set1(i:i,:2),hull2(:nh2,:2))>bwidth) then  ! boundary tolerance
+                inside=.false.
+                exit
+             end if
+          else
+                inside=.false.
+                exit
+          end if              
+       end if
+    end do
+    if(inside) then
+       numb=numb+1
+       set3(numb,:)=set1(i,:)
+    end if
+  end do
+  end if
+
+  ! if has common region, solve edge intersection
+  k=numb
+  n_intersect=0
+   do i=1,nh1
+        i2=i+1
+        if(i==nh1) i2=1
+        do j=1,nh2
+          j2=j+1
+          if(j==nh2) j2=1
+          xa=hull1(i,1); xb=hull1(i2,1); ya=hull1(i,2); yb=hull1(i2,2)
+          xc=hull2(j,1); xd=hull2(j2,1); yc=hull2(j,2); yd=hull2(j2,2)
+          delta=-(xb-xa)*(yd-yc)+(yb-ya)*(xd-xc)
+          if(delta/=0.d0) then   ! intersect
+             lambda=(-(xc-xa)*(yd-yc)+(yc-ya)*(xd-xc))/delta
+             mu=((xb-xa)*(yc-ya)-(yb-ya)*(xc-xa))/delta
+             if(lambda>=0.d0 .and. lambda<=1.d0 .and. mu>=0.d0 .and. mu<=1.d0) then
+                k=k+1
+                set3(k,:)=(/xa+lambda*(xb-xa),ya+lambda*(yb-ya)/)
+             end if
+             if(lambda>0 .and. lambda<1.d0 .and. mu>0 .and. mu<1.d0) n_intersect=n_intersect+1
+          end if
+        end do
+   end do
+   if(k>=3) then
+      area=convex2d_area(set3(:k,:))   ! there must be intersection if there is overlap
+   else if (k==0) then
+      area= -convex2d_dist(hull1(:nh1,:),hull2(:nh2,:)) ! set a negative value
+   end if
+   if(n_intersect>2) numb=ns1+ns2  ! presumed totally overlap if there are more than 2 intersect points
+
+end if
+end subroutine
+
+
+function convex2d_area(set)
+! calculate the area of a 2d convex hull
+! input: set, a matrix
+! output: area
+real*8 va(2),vb(2),area,convex2d_area,set(:,:),hull(ubound(set,1),ubound(set,2))
+integer i,j,k,nh
+
+call convex2D_hull(set,nh,hull)
+
+area=0.d0
+do i=3,nh   ! number of vertices must be >=3
+  j=i-1
+  va=(/hull(j,1)-hull(1,1),hull(j,2)-hull(1,2)/)
+  vb=(/hull(i,1)-hull(1,1),hull(i,2)-hull(1,2)/)
+  area=area+0.5d0*abs(va(1)*vb(2)-va(2)*vb(1))
+end do
+convex2d_area=area
+if(area<1d-10) area=0.d0
+
+end function
+
+
+function convex2d_in(set1,set2,bwidth)
+! check how many points of set2 are inside the hull of set1
+! input: the data set forming the hull; the point; and boundary tolerence
+! output: .false. or .true.
+
+integer i,j,k,nh,np,convex2d_in
+real*8 set1(:,:),set2(:,:),hull(ubound(set1,1),ubound(set1,2)),tmp,bwidth,norm1,norm2,vunit1(2),vunit2(2)
+
+call convex2D_hull(set1,nh,hull)
+convex2d_in=0
+
+if(nh==1) return
+np=ubound(set2,1)
+
+
+do i=1,np
+    convex2d_in=convex2d_in+1
+    do j=1,nh
+       k=j+1
+       if(j==nh) k=1
+    
+        norm1=sqrt(sum((hull(k,:)-hull(j,:))**2))
+        if(norm1<1d-10) cycle
+        vunit1=(hull(k,:)-hull(j,:))/norm1
+        norm2=sqrt(sum((set2(i,:)-hull(j,:))**2))
+        if(norm2<1d-10) cycle
+        vunit2=(set2(i,:)-hull(j,:))/norm2
+        tmp=vunit1(1)*vunit2(2)-vunit2(1)*vunit1(2)  ! cross product
+    
+       if(tmp>0.d0) then 
+            if(bwidth>0.d0) then
+               if(convex2d_dist(set2(i:i,:2),hull(:nh,:2))>bwidth) then  ! boundary tolerance
+                  convex2d_in=convex2d_in-1
+                  exit
+               end if
+            else
+                  convex2d_in=convex2d_in-1
+                  exit             
+            end if  
+       end if
+    end do
+end do
+
+end function
+
+function convex2d_dist(set1,set2)
+! calculate the distance between two data sets (convex hulls)
+
+real*8 set1(:,:),set2(:,:),convex2d_dist,hull1(ubound(set1,1),2),hull2(ubound(set2,1),2),&
+p1(2),p2(2),p3(2),va(2),vb(2),vv(2),dot,len_sq,param,dist
+integer nh1,nh2,i,j,k
+
+if(ubound(set1,1)==1) then
+  nh1=1
+  hull1=set1
+else
+  call convex2D_hull(set1,nh1,hull1)
+end if
+
+if(ubound(set2,1)==1) then
+  nh2=1
+  hull2=set2
+else
+  call convex2D_hull(set2,nh2,hull2)
+end if
+
+convex2d_dist=1d10
+
+do i=1,nh1
+   if(nh1==1) exit
+   j=i+1
+   if(i==nh1) j=1
+   p1=hull1(i,:)
+   p2=hull1(j,:)
+   va=p2-p1
+   len_sq=sum(va**2)
+   do k=1,nh2     ! distance of data from set2 to hull1
+      p3=hull2(k,:)
+      vb=p3-p1
+      dot=sum(va*vb)
+      if(len_sq/=0.d0) then
+         param=dot/len_sq
+      else
+         param=-1
+      end if
+
+      if(param<0.d0) then
+         vv=p1
+      else if (param>1.d0) then
+         vv=p2
+      else
+        vv(1)=p1(1)+param*va(1)
+        vv(2)=p1(2)+param*va(2)
+      end if
+      dist=sqrt(sum((p3-vv)**2))
+      if(dist<convex2d_dist) convex2d_dist=dist
+   end do
+end do
+
+do i=1,nh2
+   if(nh2==1) exit
+   j=i+1
+   if(i==nh2) j=1
+   p1=hull2(i,:)
+   p2=hull2(j,:)
+   va=p2-p1
+   len_sq=sum(va**2)
+   do k=1,nh1
+      p3=hull1(k,:)
+      vb=p3-p1
+      dot=sum(va*vb)
+      if(len_sq/=0.d0) then
+         param=dot/len_sq
+      else
+         param=-1
+      end if
+
+      if(param<0.d0) then
+         vv=p1
+      else if (param>1.d0) then
+         vv=p2
+      else
+        vv(1)=p1(1)+param*va(1)
+        vv(2)=p1(2)+param*va(2)
+      end if
+      dist=sqrt(sum((p3-vv)**2))
+      if(dist<convex2d_dist) convex2d_dist=dist
+   end do
+end do
+end function
+
+
+subroutine convex1d_overlap(set1,set2,bwidth,numb,length)
+! input: set1, set2, bwidth
+! output: number of data, and the length, in the overlapped region
+real*8 set1(:),set2(:),length,mini,maxi,bwidth
+integer numb,i,ns1,ns2
+ns1=ubound(set1,1)
+ns2=ubound(set2,1)
+
+numb=0
+length=0.d0
+
+mini=minval(set2)-bwidth
+maxi=maxval(set2)+bwidth
+do i=1,ns1
+ if(set1(i)>=mini .and. set1(i)<=maxi) numb=numb+1
+end do
+
+mini=minval(set1)-bwidth
+maxi=maxval(set1)+bwidth
+do i=1,ns2
+ if(set2(i)>=mini .and. set2(i)<=maxi) numb=numb+1
+end do
+
+length=(min(maxval(set1),maxval(set2))-max(minval(set1),minval(set2)))
+! if overlapped when length>0, and separated distance when length<0.
+
+end subroutine
+
+
+function convex1d_in(set1,set2,bwidth)
+! check how many data points of set2 are inside the segment by set1
+real*8 set1(:),set2(:),mini,maxi,bwidth
+integer i,convex1d_in
+ns1=ubound(set1,1)
+ns2=ubound(set2,1)
+
+convex1d_in=0
+
+mini=minval(set1)-bwidth
+maxi=maxval(set1)+bwidth
+do i=1,ubound(set2,1)
+ if(set2(i)>=mini .and. set2(i)<=maxi) convex1d_in=convex1d_in+1
+end do
+
+end function
+
+
+function convex3d_in(set1,set2,bwidth,ntri,triangles)
+! check how many points of set2 are inside a 3d convex hull of set1
+! input: the full data set1,set2, the boundary tolerence, and the hull-triangles
+
+integer iii,i,j,k,ntri,ninter,triangles(:,:),convex3d_in
+real*8 pref(3),set1(:,:),set2(:,:),bwidth,pproj(3),pinter(3),v1(3),v2(3),v3(3),&
+       normal(3),ddd,ttt,area,dist(3),vtmp(3,3)
+real*8,allocatable:: inter_all(:,:)
+logical inside
+
+convex3d_in=0
+allocate(inter_all(ntri,3))
+
+do iii=1,ubound(set2,1)
+     ninter=0
+     pref=(/set2(iii,1)+1.0,set2(iii,2)+1.0,set2(iii,3)+1.0/) ! point-pref(arbitrary) is the line
+     inside=.false.
+     do i=1,ntri
+       v1=set1(triangles(i,2),:)-set1(triangles(i,1),:)
+       v2=set1(triangles(i,3),:)-set1(triangles(i,1),:)
+       v3=set1(triangles(i,3),:)-set1(triangles(i,2),:)
+       normal=crosspro(v1,v2)  ! normal of the triangle plane
+       ddd=-sum(normal*set1(triangles(i,1),:)) ! the d of plane: ax+by+cz+d=0
+       ttt=-(sum(normal*set2(iii,:))+ddd)/sum(normal**2)  ! the t of line equation
+       pproj=set2(iii,:)+normal*ttt  ! projection of the point on the triangle plane
+     
+       ! measure the distance of the point from the triangle
+       if(sqrt(sum((set2(iii,:)-pproj)**2))<=bwidth) then
+          if(intriangle(pproj,set1(triangles(i,1),:),set1(triangles(i,2),:),set1(triangles(i,3),:))) then !projection inside
+             convex3d_in=convex3d_in+1
+             inside=.true.
+          else
+             vtmp(1,:)=set2(iii,:)-set1(triangles(i,1),:)
+             vtmp(2,:)=set2(iii,:)-set1(triangles(i,2),:)
+             vtmp(3,:)=set2(iii,:)-set1(triangles(i,3),:)
+             if(sum(vtmp(1,:)*v1) <=0.d0 .or. &
+                sum(vtmp(2,:)*(-v1)) <=0.d0 ) then 
+                 dist(1)=min(sqrt(sum((vtmp(1,:))**2)),sqrt(sum((vtmp(2,:))**2)))
+             else ! only acute angles in the triangle
+                 area=sqrt(sum((crosspro(vtmp(1,:),v1))**2)) 
+                 dist(1)=area/sqrt(sum(v1**2))
+             end if
+             if(sum(vtmp(1,:)*v2) <=0.d0 .or. &
+                sum(vtmp(3,:)*(-v2)) <=0.d0 ) then
+                 dist(2)=min(sqrt(sum((vtmp(1,:))**2)),sqrt(sum((vtmp(3,:))**2)))
+             else ! only acute angles in the triangle
+                 area=sqrt(sum((crosspro(vtmp(1,:),v2))**2))
+                 dist(2)=area/sqrt(sum(v2**2))
+             end if
+             if(sum(vtmp(3,:)*(-v3)) <=0.d0 .or. &
+                sum(vtmp(2,:)*v3) <=0.d0 ) then
+                 dist(3)=min(sqrt(sum((vtmp(3,:))**2)),sqrt(sum((vtmp(2,:))**2)))
+             else ! only acute angles in the triangle
+                 area=sqrt(sum((crosspro(vtmp(3,:),(-v3)))**2))
+                 dist(3)=area/sqrt(sum(v3**2))
+             end if
+             if(min(dist(1),dist(2),dist(3))<=bwidth) then
+                 convex3d_in=convex3d_in+1
+                 inside=.true.
+              end if
+          end if
+       end if
+     
+       if(inside) exit
+
+       ! if not in the triangle, check intersections
+       if(sum((set2(iii,:)-pref)*normal)/=0.d0 ) then ! line-plane is not parallel
+         pinter=interlp(set1(triangles(i,1),:),set1(triangles(i,2),:),set1(triangles(i,3),:),set2(iii,:),pref)
+         if(intriangle(pinter,set1(triangles(i,1),:),set1(triangles(i,2),:),set1(triangles(i,3),:))) then
+             ninter=ninter+1
+             inter_all(ninter,:)=pinter
+         end if
+       end if
+     end do
+     
+     if(inside) cycle
+
+     do i=1,ninter-1
+       do j=i+1,ninter
+         if(sqrt(sum((inter_all(i,:)-inter_all(j,:))**2))<1d-10) cycle ! the two points are at the same position
+         v1=inter_all(i,:)-set2(iii,:)
+         v2=inter_all(j,:)-set2(iii,:)
+         if(sum(v1*v2)<0.d0) then  ! if v1 and v2 point to different direction, the point is inside
+            convex3d_in=convex3d_in+1     ! otherwise the point is outside
+            inside=.true.
+         end if
+         if(inside) exit
+       end do
+       if(inside) exit
+     end do
+
+end do
+
+deallocate(inter_all)
+
+end function
+
+subroutine convex3d_overlap(set1,set2,bwidth,numb)
+! counting the number of overlap data
+! input: data set 1, data set 2,bwidth(boundary tolerance)
+! output: number of data in the overlapped region
+
+integer no1,no2,i,j,k,nset1,nset2,ntri1,ntri2
+integer,allocatable:: triangles(:,:)
+real*8 set1(:,:),set2(:,:),bwidth
+logical inside
+
+nset1=ubound(set1,1)
+nset2=ubound(set2,1)
+
+call convex3D_hull(set1,ntri1,triangles)
+if(ntri1==0) then  ! <3D are not prefered
+   numb=nset1+nset2
+   return
+end if
+
+! counts data of set2 in set1
+no1=0
+no1=no1+convex3d_in(set1,set2,bwidth,ntri1,triangles)
+deallocate(triangles)  ! allocated in the convex3D_hull
+
+
+call convex3D_hull(set2,ntri2,triangles)
+if(ntri2==0) then  ! <3D are not prefered
+   numb=nset1+nset2
+   return
+end if
+
+! counts data of set1 in set2
+no2=0
+no2=no2+convex3d_in(set2,set1,bwidth,ntri2,triangles)
+deallocate(triangles)
+
+numb=no1+no2
+
+end subroutine
+
+
+end module
+
+
diff --git a/src/var_global.f90 b/src/var_global.f90
new file mode 100644
index 0000000..3bff4d3
--- /dev/null
+++ b/src/var_global.f90
@@ -0,0 +1,46 @@
+! Licensed under the Apache License, Version 2.0 (the "License");
+! you may not use this file except in compliance with the License.
+! You may obtain a copy of the License at
+!
+!     http://www.apache.org/licenses/LICENSE-2.0
+!
+! Unless required by applicable law or agreed to in writing, software
+! distributed under the License is distributed on an "AS IS" BASIS,
+! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+! See the License for the specific language governing permissions and
+! limitations under the License.
+
+
+module var_global
+! global variabls used by multiple modules
+use mpi
+
+implicit none
+
+type run_time ! parameters to show the program execution time
+  real*8 sFCDI,eFCDI,sFC,eFC,sDI,eDI
+end type
+
+type LASSOpara  ! parameters for LASSO
+  integer max_iter,nlambda,dens,nl1l0
+  real*8  tole,minrmse,elastic
+  logical warm_start,weighted
+end type LASSOpara
+
+type(run_time) mytime
+type(LASSOpara) L1para
+
+integer   nsf,ntask,nunit,fcomplexity,rung,str_len,nf_DI,nf_L0,ptype,Smaxlen,&
+          desc_dim,nmodel,iFCDI,fileunit,task_weighting,npoint,restart,fstore
+real*8    fmax_min,fmax_max,bwidth,PI
+parameter (str_len=150,PI=3.14159265d0,Smaxlen=60)
+character ops(20)*200,method_so*10,metric*10
+integer*8 nf_sis(10000),nf_sis_avai(10000)
+logical   fit_intercept,scmt
+integer,allocatable:: nsample(:),ngroup(:,:),isconvex(:,:)
+real*8,allocatable:: target_y(:),pfdata(:,:),res(:),feature_units(:,:),ypred(:)
+character(len=30),allocatable:: pfname(:)
+integer   mpierr,mpirank,mpisize,status(MPI_STATUS_SIZE)
+
+end module 
+