Commit 7218c7b

Merge remote-tracking branch 'upstream/master' into bert
2 parents: e0e14e3 + 6e99f2a

File tree: 8 files changed (+120, -10 lines)

.github/workflows/build.yml

Lines changed: 41 additions & 0 deletions
```diff
@@ -184,6 +184,47 @@ jobs:
           cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
           cmake --build . --config Release -j $(nproc)
 
+  ubuntu-22-cmake-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
+          cmake --build . --config Release -j $(nproc)
+
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   # how to debug it.
   # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
```
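For context, here is a hedged sketch of the kind of fp32-to-fp16 work the new `-DLLAMA_SYCL_F16=ON` job ends up exercising. It is plain SYCL 2020 written for this note, not repository code; the only assumption is the oneAPI `icpx` toolchain that the job itself installs.

```cpp
// Minimal sketch, not repository code: convert fp32 to sycl::half on the device,
// the kind of half-precision path that LLAMA_SYCL_F16 enables in the SYCL backend.
// Build (assumption: oneAPI DPC++ installed): icpx -fsycl fp16_demo.cpp -o fp16_demo
#include <sycl/sycl.hpp>
#include <cstdio>
#include <vector>

int main() {
    sycl::queue q;                                        // default device chosen by the runtime
    std::vector<float>      src = {0.1f, 0.2f, 0.3f, 0.4f};
    std::vector<sycl::half> dst(src.size());
    {
        sycl::buffer<float>      bsrc(src.data(), sycl::range<1>(src.size()));
        sycl::buffer<sycl::half> bdst(dst.data(), sycl::range<1>(dst.size()));
        q.submit([&](sycl::handler & h) {
            sycl::accessor a(bsrc, h, sycl::read_only);
            sycl::accessor b(bdst, h, sycl::write_only);
            h.parallel_for(sycl::range<1>(src.size()), [=](sycl::id<1> i) {
                b[i] = sycl::half(a[i]);                  // fp32 -> fp16 on the device
            });
        });
    }                                                     // buffers sync back to the host here
    for (const sycl::half v : dst) {
        printf("%.4f\n", static_cast<float>(v));
    }
    return 0;
}
```

The backend itself uses ggml's own conversion kernels (see the `ggml-sycl.cpp` hunk further down); this only shows what half precision means at the SYCL language level.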

common/sampling.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -132,7 +132,7 @@ static void sampler_queue(
     const float   temp              = params.temp;
     const float   dynatemp_range    = params.dynatemp_range;
     const float   dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
+    const int32_t top_k             = params.top_k;
     const float   top_p             = params.top_p;
     const float   min_p             = params.min_p;
     const float   tfs_z             = params.tfs_z;
```

convert-hf-to-gguf.py

Lines changed: 61 additions & 2 deletions
```diff
@@ -1082,17 +1082,76 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
 
     def set_vocab(self):
         self._set_vocab_hf()
 
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # HF models permute some of the tensors, so we need to undo that
+            if name.endswith(("q_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
 
 class QwenModel(Model):
     @staticmethod
```
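The `_reverse_hf_permute` helper added above undoes the head-interleaved rotary layout that Hugging Face checkpoints use for `q_proj.weight` and `k_proj.weight`. As a hedged illustration of the same index math (written in C++ over a row-major matrix for this note; it is not the converter's code), the reshape/swapaxes/reshape amounts to a per-row shuffle:

```cpp
// Hedged sketch (illustrative only): the index math behind
// reshape(n_head, 2, half, cols) -> swapaxes(1, 2) -> reshape(rows, cols),
// i.e. undoing the HF interleaved rotary layout one row at a time.
#include <cstddef>
#include <vector>

// w is a row-major (rows x cols) matrix; rows must be divisible by 2 * n_head.
std::vector<float> reverse_hf_permute(const std::vector<float> & w,
                                      std::size_t rows, std::size_t cols, std::size_t n_head) {
    const std::size_t half = rows / n_head / 2;              // half of the per-head dimension
    std::vector<float> out(w.size());
    for (std::size_t h = 0; h < n_head; ++h) {
        for (std::size_t p = 0; p < 2; ++p) {                // the interleaved pair
            for (std::size_t r = 0; r < half; ++r) {
                const std::size_t src_row = (h * 2 + p) * half + r;   // (h, p, r) layout
                const std::size_t dst_row = (h * half + r) * 2 + p;   // (h, r, p) layout
                for (std::size_t c = 0; c < cols; ++c) {
                    out[dst_row * cols + c] = w[src_row * cols + c];
                }
            }
        }
    }
    return out;
}
```

When `n_kv_head` differs from `n_head` (grouped-query attention), the script first divides `n_head` by `n_kv_head` and then applies the same shuffle, which is why `write_tensors` passes `(n_head, n_kv_head)` for `k_proj.weight`.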

examples/llava/README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -14,14 +14,14 @@ Build with cmake or run `make llava-cli` to build it.
 After building, run: `./llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
 
 ## Model conversion
 
-- Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally:
+- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally:
 
 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@@ -38,7 +38,7 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
 
 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
 
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
````

ggml-sycl.cpp

Lines changed: 5 additions & 3 deletions
```diff
@@ -12148,7 +12148,8 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream) {
 
-    const int64_t ne00 = src0->ne[0];
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const int64_t row_diff = row_high - row_low;
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
@@ -12167,8 +12168,9 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
     } else {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
         ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
-                              ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
-                              sizeof(sycl::half), 0, 0, stream);
+                              ne00, ne00, ne01, ne02, nb00, nb01, nb02,
+                              nb03, ne10, ne11, ne12, nb10, nb11, nb12,
+                              nb13, stream);
     }
 }
 #else
```
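Replacing the single `ne00` local with `GGML_TENSOR_BINARY_OP_LOCALS` is what makes the extra `ne01/ne02` and `nb00…nb13` arguments in the updated `ggml_cpy_f32_f16_sycl` call available. The real macro lives in ggml's headers; the following is only a hedged sketch of the naming convention it provides, using a stand-in struct rather than the actual `ggml_tensor`:

```cpp
// Illustrative only: a stand-in tensor type and the kind of locals that
// GGML_TENSOR_BINARY_OP_LOCALS declares from src0/src1/dst, so code can refer to
// ne01, ne02, nb00..nb03 (src0), ne10../nb10.. (src1), ne../nb.. (dst) directly.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct toy_tensor {        // stand-in for ggml_tensor (simplified, an assumption)
    int64_t ne[4];         // number of elements in each dimension
    size_t  nb[4];         // stride in bytes for each dimension
};

int main() {
    toy_tensor src0 = {{4096, 32, 1, 1}, {4, 4 * 4096, 4 * 4096 * 32, 4 * 4096 * 32}};

    // src0 locals follow the "ne0x / nb0x" pattern; src1 would use ne1x / nb1x,
    // and dst the unsuffixed nex / nbx names.
    const int64_t ne00 = src0.ne[0], ne01 = src0.ne[1], ne02 = src0.ne[2];
    const size_t  nb00 = src0.nb[0], nb01 = src0.nb[1], nb02 = src0.nb[2], nb03 = src0.nb[3];

    printf("ne00=%lld ne01=%lld ne02=%lld nb00=%zu nb01=%zu nb02=%zu nb03=%zu\n",
           (long long) ne00, (long long) ne01, (long long) ne02, nb00, nb01, nb02, nb03);
    return 0;
}
```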

llama.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -2981,6 +2981,8 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_MINICPM:
             {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 40: model.type = e_model::MODEL_2B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -9032,6 +9034,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
 
     const int64_t t_start_sample_us = ggml_time_us();
 
+    if (k <= 0) {
+        k = candidates->size;
+    }
+
     k = std::max(k, (int) min_keep);
     k = std::min(k, (int) candidates->size);
 
```
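Together with the `common/sampling.cpp` change above, this moves the "top-k disabled" handling into `llama_sample_top_k` itself: `k <= 0` now means "keep every candidate" rather than the caller substituting `n_vocab`. A hedged usage sketch against `llama.h` from this tree, mirroring the pattern of `tests/test-sampling.cpp` (no new API is assumed):

```cpp
// Hedged sketch, following the style of tests/test-sampling.cpp:
// with k = 0, llama_sample_top_k now keeps the full candidate list.
#include "llama.h"
#include <vector>

void demo_top_k_disabled() {
    std::vector<llama_token_data> cur;
    for (llama_token id = 0; id < 4; ++id) {
        cur.push_back({ id, /*logit =*/ (float) id, /*p =*/ 0.0f });
    }
    llama_token_data_array arr = { cur.data(), cur.size(), /*sorted =*/ false };

    // Previously callers had to map k <= 0 to n_vocab themselves (as sampler_queue did);
    // after this commit the function treats k <= 0 as "no top-k cut".
    llama_sample_top_k(/*ctx =*/ nullptr, &arr, /*k =*/ 0, /*min_keep =*/ 1);

    // arr.size is still 4: all candidates survive, only sorted by logit.
}
```

This is exactly what the two new `test_top_k` cases below assert for `k = 4` and `k = 0`.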

tests/.gitignore

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 *
 !*.*
-test-c.o
+*.o
```

tests/test-sampling.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -235,6 +235,8 @@ int main(void) {
 
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
 
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
```
