I know that quantization uses int8 (or int4) weights to reduce memory usage, but when I print a weight it is float16, so how does quantization help accelerate inference? Do they convert the floats to ints only when doing the matrix multiplication and then convert them back to float afterwards?
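To make my question concrete, this is roughly the mental model I have of a quantized linear layer. It is only a sketch: the function and the names (quantized_linear, qweight, scales, zeros) are made up, not the real GPTQ kernel, and I use float32 so the toy runs on CPU (the real model would be float16 on GPU).

import torch

def quantized_linear(x, qweight, scales, zeros):
    # store integer weights plus per-row scales/zeros,
    # and only turn them back into floats at matmul time
    w = (qweight.to(torch.float32) - zeros) * scales   # int -> float on the fly
    return x @ w.t()                                   # ordinary float matmul

# toy shapes just to show the flow
x = torch.randn(1, 8)
qweight = torch.randint(-128, 127, (16, 8), dtype=torch.int8)
scales = torch.rand(16, 1) * 0.01
zeros = torch.zeros(16, 1)
print(quantized_linear(x, qweight, scales, zeros).shape)  # torch.Size([1, 16])

Is that roughly the idea, or does the conversion go the other way around?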
Here is my code:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-64g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

prompt = "Tell me about AI"
print("\n\n*** Generate:")
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda(1)

# check whether the model is actually quantized by printing a weight
# (see also the snippet after the output below)
print(model.lm_head.weight)
And here is the output:
*** Generate:
Parameter containing:
tensor([[-0.0036, 0.0027, -0.0074, ..., 0.0039, -0.0084, 0.0065],
[-0.0311, 0.0449, -0.0029, ..., -0.0228, 0.0147, 0.0320],
[-0.0125, 0.0014, 0.0188, ..., -0.0264, 0.0156, -0.0073],
...,
[-0.0294, -0.0172, -0.0029, ..., 0.0140, -0.0116, -0.0234],
[ 0.0204, 0.0239, 0.0272, ..., 0.0048, -0.0097, -0.0064],
[ 0.0081, -0.0057, 0.0082, ..., -0.0282, -0.0164, 0.0311]],
device='cuda:1', dtype=torch.float16, requires_grad=True)
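Following up on the "check whether the model is actually quantized" comment in my code: besides lm_head, I assume I could inspect one of the inner projection layers directly, something like the snippet below. The layer path and the attribute names (qweight, scales) are only my guess at what the GPTQ linear layers expose, so they may be wrong.

# layer path and attribute names are assumptions, not verified API
layer = model.model.layers[0].self_attn.q_proj
print(type(layer))                # expecting some quantized linear class, not nn.Linear
if hasattr(layer, "qweight"):
    print(layer.qweight.dtype)    # expecting an integer dtype here
    print(layer.scales.dtype)     # expecting a float dtype for the scales

Would that be the right way to verify that the model really is quantized?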