From 4887a4c1d817a0553b3e0a75a39c5847df7d0b25 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 18:05:13 +0800 Subject: [PATCH 01/10] Update .gitignore and submodule --- .gitignore | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) mode change 160000 => 120000 vendor/llama.cpp diff --git a/.gitignore b/.gitignore index 36ed7f7fdc..17c4b9a1bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ .vscode/ - +*.swp _skbuild/ .envrc diff --git a/vendor/llama.cpp b/vendor/llama.cpp deleted file mode 160000 index 447ccbe8c3..0000000000 --- a/vendor/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d diff --git a/vendor/llama.cpp b/vendor/llama.cpp new file mode 120000 index 0000000000..62e3711ded --- /dev/null +++ b/vendor/llama.cpp @@ -0,0 +1 @@ +../../llama.cpp \ No newline at end of file From 8e78c8fd82658c1af97aa5b9f025da3c44999867 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 20:52:38 +0800 Subject: [PATCH 02/10] Print Chat Log to Console --- llama_cpp/server/app.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef319c7e0a..2d206ff1bd 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -478,6 +478,8 @@ async def create_chat_completion( } kwargs = body.dict(exclude=exclude) + log = kwargs.copy() + if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), @@ -490,11 +492,22 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + streamRole = '' + streamContent = '' async for chat_chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) + 
+ if 'role' in chat_chunk['choices'][0]['delta']: + streamRole = chat_chunk['choices'][0]['delta']['role'] + if 'content' in chat_chunk['choices'][0]['delta']: + streamContent += chat_chunk['choices'][0]['delta']['content'] + if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) + log['messages'].append({'role':streamRole, 'content':streamContent}) + print(json.dumps(log,indent=4)) + except anyio.get_cancelled_exc_class() as e: print("disconnected") with anyio.move_on_after(1, shield=True): From 6c246a50fdaa4b5beeb3e4cb255608b756ffabb0 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 21:24:45 +0800 Subject: [PATCH 03/10] Function: Add redis support for saving chat log --- llama_cpp/server/app.py | 26 ++++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2d206ff1bd..1cfec0139c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,5 +1,6 @@ import json import multiprocessing +import redis from threading import Lock from functools import partial from typing import Iterator, List, Optional, Union, Dict @@ -85,13 +86,22 @@ class Settings(BaseSettings): port: int = Field( default=8000, description="Listen port" ) + redishost: str = Field( + default="None", description="Redis server address" ) + redisport: int = Field( + default=6379, description="Redis server port" ) + redisdb: int = Field( + default=0, description="Redis server db" ) router = APIRouter() settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None - +rediscon: Optional[redis.StrictRedis] = None def create_app(settings: Optional[Settings] = None): if settings is None: @@ -108,6 +118,14 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) + + if settings.redishost != 'None': + global rediscon + try: + redscon = 
redis.StrictRedis(host=settings.redishost, port=settings.redisport, db=settings.redisdb) + except Exception as e: + print(e) + global llama llama = llama_cpp.Llama( model_path=settings.model, @@ -506,7 +524,11 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) log['messages'].append({'role':streamRole, 'content':streamContent}) - print(json.dumps(log,indent=4)) + + #print(json.dumps(log,indent=4)) + if rediscon is not None: + logstr = json.dumps(log) + rediscon.rpush('llama.cpp', logstr) except anyio.get_cancelled_exc_class() as e: print("disconnected") diff --git a/setup.py b/setup.py index fbd5551442..7a6b6a2779 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3", "redis[hiredis]>= 4.1.0"], }, python_requires=">=3.7", classifiers=[ From b7feecb0189454a8c55f0b9b9d6ecf5778f959e6 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 21:25:17 +0800 Subject: [PATCH 04/10] Add build script --- build.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 build.sh diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000..697243be10 --- /dev/null +++ b/build.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +rm -rf dist _skbuild + +CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_DMMV_F16=on -DLLAMA_NATIVE=on -DLLAMA_LTO=on -DLLAMA_AVX512=on -DLLAMA_AVX512_VBMI=on -DLLAMA_AVX512_VNNI=on " FORCE_CMAKE=1 python setup.py bdist_wheel + From 0135e051f22e7e27845adce9fe029dbcbe0b881e Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 21:34:51 +0800 Subject: [PATCH 05/10] Bugfix: Redis: Typo --- llama_cpp/server/app.py | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 1cfec0139c..9aa70ec7b8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -122,7 +122,8 @@ def create_app(settings: Optional[Settings] = None): if settings.redishost != 'None': global rediscon try: - redscon = redis.StrictRedis(host=settings.redishost, port=settings.redisport, db=settings.redisdb) + rediscon = redis.StrictRedis(host=settings.redishost, port=settings.redisport, db=settings.redisdb) + print(rediscon) except Exception as e: print(e) From 6427f8572ac33585c1114ea946a9a2d7846389f6 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 22:30:49 +0800 Subject: [PATCH 06/10] Function: Script for save log from redis --- log/llama.log.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100755 log/llama.log.py diff --git a/log/llama.log.py b/log/llama.log.py new file mode 100755 index 0000000000..c8ea61c2bf --- /dev/null +++ b/log/llama.log.py @@ -0,0 +1,26 @@ +#!/bin/env python3 + +import redis +import json +import os +from datetime import datetime + +logDir = '/var/log/llama.cpp' +logJsonName = 'llama.log' +logJsonFile = os.path.join(logDir, logJsonName) + +if not os.path.exists(logDir): + os.makedirs(logDir) + +rediscon = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) +print(rediscon) + +with open(logJsonFile, 'a') as fJson: + while True: + logDataStr = rediscon.blpop('llama.cpp')[1].decode('utf8') + ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + logDict = {'time':ts, 'log':json.loads(logDataStr)} + logJsonStr = json.dumps(logDict, indent=4, ensure_ascii=False) + fJson.write(logJsonStr + '\n') + fJson.flush() + From a965112da6d2b3fadb29c61f6475feec550f0fb2 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 28 Jun 2023 22:37:28 +0800 Subject: [PATCH 07/10] Tuning: Improve log format --- log/llama.log.py | 1 + 1 file changed, 1 insertion(+) diff --git a/log/llama.log.py b/log/llama.log.py 
index c8ea61c2bf..71342dd757 100755 --- a/log/llama.log.py +++ b/log/llama.log.py @@ -22,5 +22,6 @@ logDict = {'time':ts, 'log':json.loads(logDataStr)} logJsonStr = json.dumps(logDict, indent=4, ensure_ascii=False) fJson.write(logJsonStr + '\n') + fJson.write('='*50 + '\n') fJson.flush() From 773873800b35560eb614de84671e7b4e6653a9e4 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Sun, 9 Jul 2023 16:21:15 +0800 Subject: [PATCH 08/10] Add build script for windows --- build.bat | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 build.bat diff --git a/build.bat b/build.bat new file mode 100644 index 0000000000..5a04d4d989 --- /dev/null +++ b/build.bat @@ -0,0 +1,5 @@ + +set CMAKE_ARGS=-DLLAMA_CUBLAS=on -DLLAMA_CUDA_DMMV_F16=on -DLLAMA_NATIVE=on -DLLAMA_AVX512=on -DLLAMA_AVX512_VBMI=on -DLLAMA_AVX512_VNNI=on +set FORCE_CMAKE=1 +python setup.py bdist_wheel + From e05aa881b899234cc052e2621a3def9451651727 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Tue, 11 Jul 2023 12:58:13 +0800 Subject: [PATCH 09/10] update .gitignore, add nohup.out --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 17c4b9a1bd..2e701b11d3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ *.swp _skbuild/ +nohup.out + .envrc models/ From c3c74c27c8971df673d334147f3c1f79a6cb1d64 Mon Sep 17 00:00:00 2001 From: River Zhou Date: Wed, 12 Jul 2023 17:26:23 +0800 Subject: [PATCH 10/10] Function: Save log in non stream mode --- llama_cpp/server/app.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 9aa70ec7b8..ee5202273c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -548,6 +548,20 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore ) + #print(json.dumps(completion,indent=4)) + + messageRole = '' + messageContent = '' + 
if 'role' in completion['choices'][0]['message']: + messageRole = completion['choices'][0]['message']['role'] + if 'content' in completion['choices'][0]['message']: + messageContent = completion['choices'][0]['message']['content'] + log['messages'].append({'role':messageRole, 'content':messageContent}) + + #print(json.dumps(log,indent=4)) + if rediscon is not None: + logstr = json.dumps(log) + rediscon.rpush('llama.cpp', logstr) return completion