Merge pull request #127 from hikettei/refactor
[Add] Runtime Checking of Total Elements of LazyReshape
hikettei authored Oct 1, 2023
2 parents c69d1bd + d5cedbb commit ac3f070
Showing 5 changed files with 50 additions and 19 deletions.
29 changes: 20 additions & 9 deletions source/base-impl/fundamental.lisp
@@ -274,12 +274,20 @@ Tips: If a function is passed as the first element of `subscript`, the subscript
:documentation "")
(setf (ignore-shape-error self) t))

(define-impl (ReshapeTensorNode :device t)
:save-for-backward (t) ;; =T is necessary not to delete MoveTensorNode.
:forward ((self x y)
`(progn
(setf (tensor-vec ,y) (tensor-vec ,x))
,y)))
(define-impl-op (ReshapeTensorNode :device t)
    :forward ((self x y)
        ;; Reshaping keeps the order of the storage vec unchanged;
        ;; only the other factors (e.g., shape and strides) change.

        ;; [TODO] Detect this error before execution.
        (assert (= (total x) (total y))
            nil
            "ReshapeTensorNode: Attempted to move x to y, but the total sizes (with dynamic shapes resolved) do not match:
~a and ~a" x y)
        (setf (tensor-vec y) (tensor-vec x))
        y))
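The runtime assertion above matters when an axis is symbolic, because the static check in `!reshape` (below) cannot run until the shape is known. A minimal usage sketch, with the `:x` name and the `(N 2 3)` shape illustrative, and `make-input`/`build`/`set-input`/`forward`/`randn` assumed to behave as in cl-waffe2's README:

;; N is unknown until set-input, so only the runtime assertion in
;; ReshapeTensorNode can validate the total number of elements.
(let* ((x     (make-input `(N 2 3) :x))
       (model (build (!reshape x t))))    ; flatten; total depends on N
  (set-input model :x (randn `(4 2 3)))   ; N = 4, total = 24
  (forward model))                        ; totals match -> reshape succeeds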

;; ===============================================================
;; Reshaping APIs
@@ -344,7 +352,7 @@ Note: If the first element of `shapes` is a function, `shapes` are overwritten w
(declare (type AbstractTensor tensor))

(let* ((shapes (if (functionp (car shapes))
(funcall (car shapes) tensor)
(funcall (car shapes) tensor)
shapes))
(shapes (parse-reshape-args (shape tensor) shapes))
(result (make-input shapes nil
@@ -356,10 +364,12 @@ Note: If the first element of `shapes` is a function, `shapes` are overwritten w
(assert (= (apply #'* (shape tensor))
(apply #'* shapes))
nil
"Reshaping failed because the total size do not match."))
"Reshaping failed because the total sizes do not match."))
;; (!view tensor `(2 4) `(2 4)) -> Copy
;; (!view tensor 0 t t t)
(let ((out (forward (ReshapeTensorNode (shape tensor) shapes) (->contiguous tensor) result)))
(let ((out (forward (ReshapeTensorNode (shape tensor) shapes)
(->contiguous tensor)
result)))
out)))

;; !squeeze/!unsqueeze
@@ -383,6 +393,7 @@ Note: If the first element of `shapes` is a function, `shapes` are overwritten w
equivalent to `(!reshape tensor t)`.
"
(!reshape tensor t)))
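Since `t` may appear at most once among `shapes` and is inferred from the remaining size, a few illustrative calls (shapes assumed; results shown as comments):

(!reshape (randn `(3 4)) t)    ; t <- 12, giving shape (12); same as !flatten
(!reshape (randn `(3 4)) 2 t)  ; t <- 6, giving shape (2 6)
(!reshape (randn `(3 4)) 5 t)  ; no integer t fits 12 -> rejected by the total-size check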

(declaim (ftype (function (AbstractTensor fixnum &key (:at fixnum)) AbstractTensor) !rankup))
(defun !rankup (tensor ntimes &key (at 0))
"
5 changes: 5 additions & 0 deletions source/vm/allocation.lisp
@@ -319,6 +319,11 @@ Reading from the last iseq, the function attributes T at each last reference
;; [TODO] Share memory-pools between forward and backward

(%in-place-vm-ops! iseq)
;; Simulate Memory Pool optimization (a.k.a. buffer sharing)
;; cl-waffe2 does not allow tensor creation at runtime; that is,
;; once a node is declared as X Y Z -> Z,
;; every tensor appearing in out-from must also appear in in-from.
;; We use this fact to reconnect all `InputTensor`s and reduce the number of temporary values.
(simulate-memory-pool! iseq)
(%in-place-vm-ops! iseq-bw-flat)
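A self-contained sketch of the buffer-sharing idea in that comment; the `tns`/`inst` structs and accessors below are hypothetical stand-ins, not cl-waffe2 internals:

;; Every output of an instruction declared as X Y Z -> Z already appears
;; among its inputs, so each output can alias an input's storage.
(defstruct tns name buffer)
(defstruct inst inputs outputs)

(defun simulate-pool-sketch (iseq)
  "Reconnect each output tensor to the storage of a matching input."
  (dolist (i iseq iseq)
    (loop for out in (inst-outputs i)
          for in  in (inst-inputs i)
          do (setf (tns-buffer out) (tns-buffer in)))))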

2 changes: 1 addition & 1 deletion source/vm/generic-tensor/call-with-view.lisp
@@ -271,7 +271,7 @@ A principal operator for extending your functions to higher-rank arrays.
(call-with-view function tensors &key (at-least-dim 1) (force-order nil) (lparallel nil))
```
The function `call-with-view` generates a lisp code of `(loop for ...)` iteration for nd-arrays, which follows the optimal route, is parallelized, and later composable. Since generating an optimal `for(int i=0;i<size;i++){...}` route according to the given rank of tensors is one of the main concerns of JIT Compiler for Deep Learning Framework, this function is usually combined with the forward definition of `define-impl` macro. It is later compiled to lambda functions and used as nodes in cl-waffe2 IR.
The function `call-with-view` generates a lisp code of `(loop for ...)` iteration for nd-arrays, which follows the optimal route, is parallelized, and later composable. Since generating an optimal `for(int i=0;i<size;i++){...}` route according to the given rank of tensors is one of the main concerns of JIT Compiler for Deep Learning Framework, this function is usually combined with the forward definition of `define-impl` macro. It is later compiled to lambda functions and used as nodes in cl-waffe2 IR.
In the simplest case, `call-with-view` first deploys `(loop for...)` until the rank of given tensors reaches the given `at-least-dim`. After reaching `at-least-dim`, the function places the result of calling the given `function`.
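As a rough sketch of that combination, assuming the view accessors (`size-of`, `stride-of`, `offset-of`) from cl-waffe2's documentation; `SinNode` and `sin-kernel` are illustrative, not the library's real implementation:

(define-impl (SinNode :device CPUTensor)
  :forward ((self x out)
    `(progn
       ;; the lambda receives one view per tensor and returns the code
       ;; placed at the innermost loop that call-with-view generates
       ,(call-with-view
          #'(lambda (x-view o-view)
              `(sin-kernel (tensor-vec ,x)   (offset-of ,x-view 0) (stride-of ,x-view 0)
                           (tensor-vec ,out) (offset-of ,o-view 0) (stride-of ,o-view 0)
                           (size-of ,x-view 0)))
          (list x out))
       ,out)))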
31 changes: 23 additions & 8 deletions source/vm/generic-tensor/do-compiled-loop.lisp
@@ -6,6 +6,21 @@
;; and minimize the costs while maximizing the use of SIMD registers.
;; This computation is done at runtime (<< 1e-5 sec) and cached.
;; In a future release, integrate this file into call-with-view for simplicity.
;; Reading List:
;; Ref: https://inria.hal.science/inria-00551077/document
;; https://atrg.jp/ja/index.php?plugin=attach&pcmd=open&file=20171007_ATOS17_Sato.pdf&refer=ATOS17
;; https://arxiv.org/pdf/2005.04091.pdf
;; http://perso.ens-lyon.fr/christian.perez/_media/180308/cash_cohen.pdf
;; Polyhedral Compiler
;; https://ucbrise.github.io/cs294-ai-sys-sp19/assets/lectures/lec12/dl-compilers.pdf
;;

;; As of this writing, the iteration features work well enough for element-wise operations,
;; but for permuted tensors they significantly reduce performance.
;; We could easily tackle this problem by using foreign DL frameworks like oneDNN, but that would restrict the flexibility of cl-waffe2.
;; Loop-oriented optimization should not be limited to calling foreign libraries; implement kernel-size=0 and
;; JIT compilation to vectorized C++/CUDA kernels?

(defstruct (AbstractLoop
(:conc-name aloop-)
@@ -57,9 +72,9 @@ If the remaining loops consist of T or :broadcast (i.e., contiguous on memory),
;; Possible cases are: T T T... or broadcast broadcast ...
;;(not
;; (every #'(lambda (v)
;; (eql (force-list v)
;; (force-list (car views))))
;; views))
;; (eql (force-list v)
;; (force-list (car views))))
;; views))
(some #'(lambda (v)
(not (or (eql (force-list v) t)
(eql (force-list v) :broadcast))))
@@ -107,7 +122,7 @@ Examples:
(equal (butlast (shape (car tensors)) kernel-size) (butlast (shape x) kernel-size))))
tensors)
nil
"Assertion Failed: solve-loop-order, Tensors must be the shape size, and not include symbols.")
"Assertion Failed: solve-loop-order, Tensors must be the same shape size, and not include symbols.")

(assert (every #'(lambda (x)
(>= (the fixnum (dims x)) kernel-size))
@@ -230,7 +245,7 @@
(calc-strides (translate-adjustable-shape (original-shape tensor)) (order tensor))
(tensor-stride tensor)
(sync (tensor-stride tensor) (reverse (tensor-permute-order tensor))))))


(labels ((expand-helper (&optional (c 0) (offsets offsets))
(declare (type fixnum c)
@@ -359,8 +374,8 @@ Iterates the given tensors in optimized order. The behavior is the same as the `
(when (null (gethash ',cache-id *compiled-loop-table*))
(setf (gethash ',cache-id *compiled-loop-table*) (make-hash-table :test #'equal)))
(do-compiled-loop*
(maybe-solve-loop ',cache-id ,tensor-list ,kernel-size ,(not collapse) ,mode)
#'(lambda (,@views-bind)
,@body)
(maybe-solve-loop ',cache-id ,tensor-list ,kernel-size ,(not collapse) ,mode)
#'(lambda (,@views-bind)
,@body)
,tensor-list)))
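A hedged usage sketch of the macro, with the surface syntax read off the expansion above (tensor list, options, one view binding per tensor, then the body); `vref` and `offset-of` are assumed from cl-waffe2's documentation:

;; Copy x into y element-wise, iterating in the solved (optimized) order.
(do-compiled-loop (list x y) ()
    (x-view y-view)
  (setf (vref y (offset-of y-view 0))
        (vref x (offset-of x-view 0))))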

2 changes: 1 addition & 1 deletion source/vm/generic-tensor/tensor.lisp
@@ -332,7 +332,7 @@ The generic function current-backend-state is used to render (show-backends)

(defun total (tensor)
(declare (type AbstractTensor tensor))
(apply #'lazy-mulup (shape tensor)))
(apply #'* (translate-adjustable-shape (shape tensor))))

(defun dims (tensor)
(declare (type AbstractTensor tensor))
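The new `total` resolves symbolic axes before multiplying, where the old `lazy-mulup` built a lazy product. A self-contained sketch of that behavior, with a plain alist standing in for the library's adjustable-shape table (hypothetical, not cl-waffe2 internals):

;; Resolve symbolic axes against the current bindings, then multiply.
(defparameter *shape-env-sketch* '((N . 4)))

(defun translate-shape-sketch (shape)
  (mapcar #'(lambda (s)
              (if (symbolp s) (cdr (assoc s *shape-env-sketch*)) s))
          shape))

;; (apply #'* (translate-shape-sketch '(N 3))) ;; => (* 4 3) => 12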
