I was trying to register a LangChain RetrievalQA chain with MLflow. Here is the code snippet for my model:
import torch
from torch import cuda
import transformers
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

model_id = 'mistralai/Mistral-7B-Instruct-v0.1'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

# Set the quantization configuration to load the large model with less GPU memory;
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_config = transformers.AutoConfig.from_pretrained(model_id)

# Prepare the tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Load the model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

generation_config = transformers.GenerationConfig.from_pretrained(model_id)
generation_config.max_new_tokens = 1024
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15
generation_config.pad_token_id = 50256

# Define the text-generation pipeline
generate_text = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(pipeline=generate_text)

# vectorstore and embeddings are defined elsewhere (FAISS index built earlier)
qa_chain = RetrievalQA.from_llm(
    llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
)
The retriever works perfectly:
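For reference, a local call along these lines returns a sensible answer together with source documents (the actual output isn't reproduced here; this is just how the chain was invoked, using the same query as below):

# Quick local check of the chain before logging it
result = qa_chain({"query": "what is the sun"})
print(result["result"])
print(result["source_documents"])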

Then I would like to register the qa_chain in MLflow to serve the model. Here is the code:
import mlflow
from langchain.vectorstores import FAISS

mlflow.set_experiment("/main_RAG")

# RetrievalQA
# qa_chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(search_kwargs={"k": 5}), return_source_documents=True)
qa_chain = RetrievalQA.from_llm(llm, retriever=vectorstore.as_retriever(search_kwargs={"k": 5}), return_source_documents=True)
# qa_chain = RetrievalQA.from_llm(llm, retriever=vectorstore.as_retriever())

# Loader function MLflow will call to rebuild the retriever when the model is loaded
def load_retriever(persist_directory):
    vectorstore = FAISS.load_local(persist_directory, embeddings)
    return vectorstore.as_retriever()

# Log the RetrievalQA chain
with mlflow.start_run() as run:
    model_info = mlflow.langchain.log_model(
        qa_chain,
        artifact_path="retrieval_qa",
        loader_fn=load_retriever,
        persist_dir=persist_dir,
    )
The run was properly registered in MLflow. Then I loaded the model to use it.
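(The loading snippet itself isn't shown above; assuming the model was loaded back with the langchain flavor, which is consistent with calling the chain directly below, it would look roughly like this:)

# Hypothetical reconstruction of the loading step, not the original snippet;
# mlflow.langchain.load_model returns the chain object itself.
loaded_model = mlflow.langchain.load_model(model_info.model_uri)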
But when trying the following code I get an error:
loaded_model({"query":"what is the sun"})
Here is the error:
File /databricks/python/lib/python3.10/site-packages/langchain/llms/huggingface_pipeline.py:167, in HuggingFacePipeline._call(self, prompt, stop, run_manager, **kwargs)
160 def _call(
161 self,
162 prompt: str,
(...)
165 **kwargs: Any,
166 ) -> str:
--> 167 response = self.pipeline(prompt)
168 if self.pipeline.task == "text-generation":
169 # Text generation return includes the starter text.
170 text = response[0]["generated_text"][len(prompt) :]
TypeError: 'NoneType' object is not callable
It seems like the pipeline component was not logged by MLflow: self.pipeline is None once the chain is loaded back, hence the "'NoneType' object is not callable" error.
The original qa_chain model had the pipeline:
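(The screenshot isn't reproduced here; roughly, a check along these lines shows the difference between the HuggingFacePipeline on the original chain and on the loaded one. The attribute path assumes the RetrievalQA structure built above:)

# Sketch: compare the transformers pipeline on the original chain vs. the loaded one.
# RetrievalQA.from_llm wraps the LLM inside combine_documents_chain.llm_chain.
print(qa_chain.combine_documents_chain.llm_chain.llm.pipeline)      # original: the transformers pipeline object
print(loaded_model.combine_documents_chain.llm_chain.llm.pipeline)  # loaded: appears to come back as None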

Here are the package versions:
I hope I can get some help with this.
Best,
Ch.


