
Commit a59e38a

running inference session with position getter/setter (#594)
* Add option to rollback inference for a certain number of steps (#588)
* fix
* fix
* fix
* fix
* fix
* fix
* style
* test running inference session with position getter/setter
* add assertion
* fix typo

---------

Co-authored-by: Anton Sinitsin <[email protected]>
1 parent 9aecb3f commit a59e38a

2 files changed (+24, -10 lines)

src/petals/client/inference_session.py

Lines changed: 21 additions & 9 deletions
@@ -83,14 +83,24 @@ async def _read_inputs_from_queue(queue: asyncio.Queue, input_timeout: Optional[
             if not next_input_message.uid and not next_input_message.tensors:
                 break  # this message means "done sending"
 
+    @property
+    def position(self):
+        return self._position
+
+    @position.setter
+    def position(self, start_from_position: int):
+        assert start_from_position <= self._position
+        self._position = start_from_position
+        if self.history is not None and self.history.shape[1] >= start_from_position:
+            self.history = self.history[:, :start_from_position, :] if start_from_position > 0 else None
+
     def step(
         self,
         inputs: torch.Tensor,
         prompts: torch.Tensor,
         hypo_ids: torch.LongTensor,
         *,
         step_id: str,
-        start_from_position: int,
     ) -> torch.Tensor:
         """
         Inference step: send a chunk of input tensors and receive a chunk of outputs
@@ -127,8 +137,8 @@ def step(
         request_metadata = dict(session_id=self.session_id, step_id=step_id)
         if not self.stepped:
             request_metadata.update(self.session_metadata)
-            if start_from_position is not None:
-                request_metadata["start_from_position"] = start_from_position
+            if self._position is not None:
+                request_metadata["start_from_position"] = self._position
         elif self.config.use_server_to_server:
             next_servers = self._collect_next_servers()
             if next_servers:
@@ -235,6 +245,13 @@ def num_blocks(self) -> int:
     def position(self) -> int:
         return self._position
 
+    @position.setter
+    def position(self, start_from_position: int) -> None:
+        self._position = start_from_position
+        for session in self._server_sessions:
+            assert isinstance(session, _ServerInferenceSession)
+            session.position = start_from_position
+
     def _enter_server_sessions(self, chosen_spans: List[RemoteSpanInfo]) -> List[_ServerInferenceSession]:
         server_sessions = []
         try:
@@ -275,12 +292,7 @@ def step(
         inputs: torch.Tensor,
         prompts: Optional[torch.Tensor] = None,
         hypo_ids: Optional[torch.Tensor] = None,
-        start_from_position: Optional[int] = None,
     ) -> torch.Tensor:
-
-        if start_from_position is not None:
-            self._position = start_from_position
-
         assert not self._closed
         if torch.is_grad_enabled():
             logger.warning("Running inference session with grad enabled. Gradients will *not* be propagated correctly.")
@@ -324,12 +336,12 @@ def step(
                         self._update_sequence(server_idx, block_idx, attempt_no)
 
                     server_session = self._server_sessions[server_idx]
+                    assert server_session.position == self.position
                     inputs = server_session.step(
                         inputs,
                         prompts[server_session.span.start : server_session.span.end],
                         hypo_ids,
                         step_id=step_id,
-                        start_from_position=start_from_position,
                     )
 
                     server_idx += 1
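The rollback semantics live in the `_ServerInferenceSession` setter above: assigning a smaller position truncates the cached `history` tensor to that many steps, or drops the cache entirely when rewinding to position 0. Below is a minimal standalone sketch of that truncation rule; the `[batch, seq_len, hidden]` shape follows how the setter indexes `history`, while the function name and concrete sizes are illustrative, not part of the commit:

import torch

# Stand-in for the setter's cache-rollback rule: keep the first
# `start_from_position` cached steps along the sequence dimension,
# or reset the cache entirely when rewinding to the very start.
def rollback_history(history, start_from_position):
    if history is not None and history.shape[1] >= start_from_position:
        history = history[:, :start_from_position, :] if start_from_position > 0 else None
    return history

history = torch.randn(1, 5, 16)                          # 5 cached positions
assert rollback_history(history, 2).shape == (1, 2, 16)  # truncated to 2 steps
assert rollback_history(history, 0) is None              # full reset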

tests/test_speculative_generation.py

Lines changed: 3 additions & 1 deletion
@@ -26,7 +26,9 @@ def test_remote_block_with_cache_invalidation_exact_match(atol_forward=1e-4, ato
     with torch.inference_mode():
         with remote_block.inference_session(max_length=inputs.shape[1]) as sess:
             initial_outputs_inference = sess.step(inputs)
-            secondary_outputs_inference = sess.step(short_inputs[:, 2:, :], start_from_position=2)
+
+            sess.position = 2
+            secondary_outputs_inference = sess.step(short_inputs[:, 2:, :])
     result = torch.cat([initial_outputs_inference[:, :2, :], secondary_outputs_inference], dim=1)
 
     ref_block = load_pretrained_block(MODEL_NAME, block_index, torch_dtype=torch.float32)
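Taken together, the change replaces the per-call `start_from_position` argument with a session-level `position` property, as the updated test shows. A hedged usage sketch of the resulting rewind-and-recompute flow (`sess`, `inputs`, and `revised_inputs` are placeholder names; only `sess.step` and the `sess.position` setter come from this commit):

import torch

# Process five positions, rewind the session's cached state to
# position 2, then recompute the tail with revised inputs.
full_outputs = sess.step(inputs)                      # positions 0..4
sess.position = 2                                     # discard cached state past position 2
redone_outputs = sess.step(revised_inputs[:, 2:, :])  # recompute positions 2..4
outputs = torch.cat([full_outputs[:, :2, :], redone_outputs], dim=1)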
