Commit 079f7da
Updated server impl
Maxusmusti committed Jan 25, 2024
1 parent 15a8805 commit 079f7da
Showing 3 changed files with 8 additions and 6 deletions.
language/llama2-70b/SUT.py (9 changes: 4 additions & 5 deletions)
@@ -460,14 +460,13 @@ def stream_api_grpc(self, input, response_ids):
         first = True
         resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name)
         for resp in resps:
-            if resp.text:
-                tokens = self.tokenizer(resp.text)["input_ids"][1:]
+            if resp.tokens:
+                token = self.llama_vocab[resp.tokens[0].text]
                 if first:
-                    self.first_token_queue.put((tokens[0], response_ids[0]))
-                    token_cache.extend(tokens[1:])
+                    self.first_token_queue.put((token, response_ids[0]))
                     first = False
                 else:
-                    token_cache.extend(tokens)
+                    token_cache.append(token)
         return token_cache

def async_process_query(self, input_ids_tensor, qitem_id):
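The new loop maps each server-streamed token's text straight to an id through self.llama_vocab, instead of re-tokenizing the streamed text on the client, so each response costs one dict lookup and yields exactly one token id. The commit does not show where llama_vocab is built; a minimal sketch, assuming it is derived from the Hugging Face tokenizer's vocabulary (the model name here is hypothetical):

# Sketch: build a token-text -> token-id lookup like self.llama_vocab.
# Assumption: derived from the HF tokenizer; not shown in this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
llama_vocab = tokenizer.get_vocab()  # {token_text: token_id}

# Each streamed response then needs only:
# token = llama_vocab[resp.tokens[0].text]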
language/llama2-70b/api-endpoint-artifacts/benchmark.yaml (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@ spec:
   restartPolicy: Never
   containers:
     - name: mlperf-env
-      image: quay.io/meyceoz/mlperf-inference:grpc-stream
+      image: quay.io/meyceoz/mlperf-inference:v5
       resources:
         requests:
           memory: 20000Mi
language/llama2-70b/inference.py (3 changes: 3 additions & 0 deletions)
@@ -81,6 +81,9 @@ def make_request_stream(self, text: str, model_id: str = "flan-t5-small"):
             stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria(
                 max_new_tokens=1024,
                 min_new_tokens=1
             ),
+            response=generation_pb2_grpc.generation__pb2.ResponseOptions(
+                generated_tokens=True
+            )
         )
     )
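Setting generated_tokens=True asks the server to include per-token objects in each streamed response, which is what lets the SUT.py change above read resp.tokens instead of resp.text. A minimal end-to-end sketch of issuing such a streaming request follows; the stub name, RPC name, endpoint, and model id are assumptions based on a TGIS-style generation.proto and are not part of this commit:

# Sketch only: GenerationServiceStub, GenerateStream, the endpoint, and the
# model id are assumptions; message shapes mirror the diff above.
import grpc
import generation_pb2
import generation_pb2_grpc

channel = grpc.insecure_channel("localhost:8033")  # hypothetical address
stub = generation_pb2_grpc.GenerationServiceStub(channel)

request = generation_pb2.SingleGenerationRequest(
    model_id="llama-2-70b",  # hypothetical model id
    request=generation_pb2.GenerationRequest(text="Hello"),
    params=generation_pb2.Parameters(
        stopping=generation_pb2.StoppingCriteria(
            max_new_tokens=1024,
            min_new_tokens=1,
        ),
        # The option this commit adds: stream token objects to the client.
        response=generation_pb2.ResponseOptions(generated_tokens=True),
    ),
)

for resp in stub.GenerateStream(request):
    if resp.tokens:  # populated because generated_tokens=True
        print(resp.tokens[0].text)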
