|
|
@@ -2,47 +2,104 @@
|
|
|
import json
|
|
|
import os
|
|
|
import sys
|
|
|
+import logging
|
|
|
+from datetime import datetime, timedelta
|
|
|
+from io import BytesIO
|
|
|
+
|
|
|
import openai
|
|
|
-from llama_index import GPTVectorStoreIndex, download_loader
|
|
|
-# from llama_index.chat_engine import SimpleChatEngine
|
|
|
-from datetime import datetime
|
|
|
-from django.db import connection, models
|
|
|
-from django.db.models import Q
|
|
|
-import re
|
|
|
+from llama_index import VectorStoreIndex, Document, StorageContext, load_index_from_storage
|
|
|
|
|
|
from adm.constants import CTS as cts
|
|
|
from adm.storage import MediaStorage
|
|
|
from adm.services import ParameterService
|
|
|
|
|
|
+logger = logging.getLogger('dsp')
|
|
|
+
|
|
|
class Chatbot():
|
|
|
def __init__(self) -> None:
|
|
|
objParameter = ParameterService()
|
|
|
openai.api_key = objParameter.getParameterByKey("OPENAI_API_KEY").value
|
|
|
media_storage = MediaStorage()
|
|
|
- self.bucket = media_storage.storage.bucket.name
|
|
|
- self.storage = media_storage.storage
|
|
|
- self.credentials = json.dumps(media_storage.credentials)
|
|
|
- # self.docs_path = objParameter.getParameterByKey("CHATBOT_DOCS_PATH").value
|
|
|
- self.docs_path = '/chatbot/docs'
|
|
|
- self.endpoint = 'https://' + cts.GCP_ST_ACCESS_KEY_ID
|
|
|
+ self.media_storage = media_storage
|
|
|
+ self.docs_path = 'chatbot/docs/'
|
|
|
+
|
|
|
+ # load index
|
|
|
+ self.ai_storage_context = None
|
|
|
+ self.ai_index = None
|
|
|
+ try:
|
|
|
+ self.ai_storage_context = StorageContext.from_defaults(persist_dir='./storage')
|
|
|
+ self.ai_index = load_index_from_storage(self.ai_storage_context)
|
|
|
+        except Exception as error:
|
|
|
+ # exc_type, exc_obj, exc_tb = sys.exc_info()
|
|
|
+ # fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
|
|
+ # strErro = "ERRO: %s | %s | %s | %s" % (error, exc_type, fname, exc_tb.tb_lineno)
|
|
|
+            logger.info('Chatbot() init: %s', error)
|
|
|
+ pass
|
|
|
pass
|
|
|
|
|
|
- def train (self):
|
|
|
- OpendalGcsReader = download_loader("OpendalGcsReader")
|
|
|
-
|
|
|
- loader = OpendalGcsReader(
|
|
|
- bucket=self.bucket,
|
|
|
- path=self.docs_path,
|
|
|
- endpoint=self.endpoint,
|
|
|
- credentials=self.credentials,
|
|
|
- )
|
|
|
- documents = loader.load_data()
|
|
|
+ def train (self, chat=None):
|
|
|
+ docs = []
|
|
|
+ if chat:
|
|
|
+ # use chat to build up on existing store
|
|
|
+ # save chat as doc in storage
|
|
|
+ if not 'id' in chat:
|
|
|
+ chat['id'] = datetime.utcnow().replace(microsecond=0).isoformat()
|
|
|
+ if not 'type' in chat:
|
|
|
+ chat['type'] = 'chat'
|
|
|
+ if not 'category' in chat:
|
|
|
+ chat['category'] = 'uncategorized'
|
|
|
+
|
|
|
+ blob_name = self.docs_path + 'chat-' + chat['id'] + '.json'
|
|
|
+ text = json.dumps(chat, ensure_ascii=False)
|
|
|
+
|
|
|
+            path_bucket = os.path.join(self.media_storage.location, blob_name)
|
|
|
+ self.media_storage.save(path_bucket, BytesIO(text.encode("utf-8")))
|
|
|
|
|
|
- # construct the index with the txt document
|
|
|
- index = GPTVectorStoreIndex.from_documents(documents)
|
|
|
+ docs.append({
|
|
|
+ 'id_': chat['id'],
|
|
|
+ 'text': text,
|
|
|
+ 'metadata': {
|
|
|
+ 'filename': blob_name,
|
|
|
+ 'category': chat['category']
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ else:
|
|
|
+ blobs = self.media_storage.list(self.docs_path)
|
|
|
+ for blob in blobs:
|
|
|
+ text = blob.download_as_bytes().decode('utf-8')
|
|
|
+ if len(text)>0:
|
|
|
+                    obj = json.loads(text) if text.startswith('{') else {'id': blob.name}
|
|
|
+
|
|
|
+ if not 'category' in obj:
|
|
|
+ obj['category'] = 'uncategorized'
|
|
|
+
|
|
|
+ docs.append({
|
|
|
+ 'id_': obj['id'],
|
|
|
+ 'text': text,
|
|
|
+ 'metadata': {
|
|
|
+ 'filename': blob.name,
|
|
|
+ 'category': obj['category']
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ # parse documents
|
|
|
+ ai_documents = []
|
|
|
+ for doc in docs:
|
|
|
+ ai_documents.append(Document(**doc))
|
|
|
+
|
|
|
+ # create/add to index
|
|
|
+ if self.ai_index is None:
|
|
|
+ self.ai_index = VectorStoreIndex.from_documents(ai_documents, show_progress=True)
|
|
|
+ else:
|
|
|
+ for ai_doc in ai_documents:
|
|
|
+ self.ai_index.insert(ai_doc)
|
|
|
+
|
|
|
+ # save index to disk
|
|
|
+ self.ai_index.storage_context.persist()
|
|
|
|
|
|
- chat_engine = index.as_chat_engine(
|
|
|
- chat_mode='condense_question',
|
|
|
+ chat_engine = self.ai_index.as_chat_engine(
|
|
|
+ chat_mode='react',
|
|
|
verbose=True
|
|
|
)
|
|
|
|