Spaces:

seanpedrickcase
/

topic_modelling

Running

seanpedrickcase committed on Aug 13, 2024

Commit

51ba1cb

1 Parent(s): cd6a3e0

Removed some requirements from Dockerfile for AWS deployment to reduce container size

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -15,9 +15,9 @@ RUN mkdir /model && mkdir /model/rep && mkdir /model/embed
 WORKDIR /src
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
 # Gradio must be installed after the requirements file because of a conflict with spacy in the requirements
 RUN pip install --no-cache-dir gradio==4.41.0
@@ -46,7 +46,7 @@ RUN mkdir -p /home/user/app/cache && chown -R user:user /home/user/app/cache
 #RUN git clone https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 /home/user/app/model/embed
 #RUN rm -rf /home/user/app/model/embed/.git
-# Download the BGE embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
 COPY download_model.py /src/download_model.py
 RUN python /src/download_model.py
@@ -56,7 +56,7 @@ USER user
 # Set home to the user's home directory
 ENV HOME=/home/user \
 	PATH=/home/user/.local/bin:$PATH \
-    PYTHONPATH=$HOME/app \
 	PYTHONUNBUFFERED=1 \
 	PYTHONDONTWRITEBYTECODE=1 \
 	GRADIO_ALLOW_FLAGGING=never \
@@ -66,7 +66,6 @@ ENV HOME=/home/user \
 	GRADIO_THEME=huggingface \
 	AWS_STS_REGIONAL_ENDPOINT=regional \
 	GRADIO_OUTPUT_FOLDER='output/' \
-	#GRADIO_ROOT_PATH=/data-text-search \
 	NUMBA_CACHE_DIR=/home/user/app/cache \
 	SYSTEM=spaces

 WORKDIR /src
+COPY requirements_aws.txt .
+RUN pip install --no-cache-dir -r requirements_aws.txt
 # Gradio must be installed after the requirements file because of a conflict with spacy in the requirements
 RUN pip install --no-cache-dir gradio==4.41.0
 #RUN git clone https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 /home/user/app/model/embed
 #RUN rm -rf /home/user/app/model/embed/.git
+# Download the embedding model - Create a directory for the model and download specific files using huggingface_hub
 COPY download_model.py /src/download_model.py
 RUN python /src/download_model.py
 # Set home to the user's home directory
 ENV HOME=/home/user \
 	PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=/home/user/app \
 	PYTHONUNBUFFERED=1 \
 	PYTHONDONTWRITEBYTECODE=1 \
 	GRADIO_ALLOW_FLAGGING=never \
 	GRADIO_THEME=huggingface \
 	AWS_STS_REGIONAL_ENDPOINT=regional \
 	GRADIO_OUTPUT_FOLDER='output/' \
 	NUMBA_CACHE_DIR=/home/user/app/cache \
 	SYSTEM=spaces

funcs/representation_model.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 from bertopic.representation import LlamaCPP
-from llama_cpp import Llama
 from pydantic import BaseModel
 import torch.cuda
 from huggingface_hub import hf_hub_download
@@ -152,6 +152,9 @@ def create_representation_model(representation_type: str, llm_config: dict, hf_m
             print(error_message)
             representation_model = {"LLM":base_rep}
             return representation_model
         print("Generating LLM representation")
         # Use llama.cpp to load in model

 import os
 from bertopic.representation import LlamaCPP
 from pydantic import BaseModel
 import torch.cuda
 from huggingface_hub import hf_hub_download
             print(error_message)
             representation_model = {"LLM":base_rep}
             return representation_model
+        # Otherwise, import Llama here (deferred) rather than at module level
+        else:
+            from llama_cpp import Llama
         print("Generating LLM representation")
         # Use llama.cpp to load in model

requirements_aws.txt ADDED Viewed

+boto3==1.34.158
+bertopic==0.16.2
+spacy
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+gradio # Version not pinned here due to interaction with spacy - the Dockerfile reinstalls a pinned gradio version after this requirements file is installed
+pyarrow==14.0.2
+openpyxl==3.1.2
+Faker==22.2.0
+presidio_analyzer==2.2.354
+presidio_anonymizer==2.2.354
+scipy==1.11.4
+polars==0.20.6
+sentence-transformers==3.0.1
+numpy==1.26.4