
Commit 294315f
test
ydli-ai committed Jan 8, 2024
1 parent c703d03 commit 294315f
Showing 9 changed files with 427 additions and 15 deletions.
12 changes: 6 additions & 6 deletions models/clip/base-16_config.json
@@ -6,13 +6,13 @@
         "hidden_act": "gelu",
         "heads_num": 8,
         "layers_num": 12,
-        "max_seq_length": 512,
+        "max_seq_length": 77,
         "embedding": ["word", "pos"],
         "encoder": "transformer",
-        "mask": "fully_visible",
-        "remove_embedding_layernorm": false,
-        "layernorm_positioning": "post",
-        "pooling": "first"
+        "mask": "causal",
+        "remove_embedding_layernorm": true,
+        "layernorm_positioning": "pre",
+        "pooling": "last"
     },
 
     "stream_1": {
@@ -26,7 +26,7 @@
         "embedding": ["patch", "pos"],
         "encoder": "transformer",
         "mask": "fully_visible",
-        "remove_embedding_layernorm": true,
+        "remove_embedding_layernorm": false,
         "layernorm_positioning": "pre",
         "pooling": "first"
     },
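The text stream's switch to a causal mask, pre-layernorm, and "last" pooling matches OpenAI's CLIP text encoder, which attends lower-triangularly over its 77 tokens. A minimal sketch of what the "causal" mask means (illustrative only, not TencentPretrain's internal code):

import torch

seq_len = 77  # text stream max_seq_length
# 1 = may attend, 0 = blocked: each position sees only itself and earlier tokens.
causal_mask = torch.tril(torch.ones(seq_len, seq_len))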
12 changes: 6 additions & 6 deletions models/clip/base-32_config.json
@@ -6,13 +6,13 @@
         "hidden_act": "gelu",
         "heads_num": 8,
         "layers_num": 12,
-        "max_seq_length": 512,
+        "max_seq_length": 77,
         "embedding": ["word", "pos"],
         "encoder": "transformer",
-        "mask": "fully_visible",
-        "remove_embedding_layernorm": false,
-        "layernorm_positioning": "post",
-        "pooling": "first"
+        "mask": "causal",
+        "remove_embedding_layernorm": true,
+        "layernorm_positioning": "pre",
+        "pooling": "last"
     },
 
     "stream_1": {
@@ -26,7 +26,7 @@
         "embedding": ["patch", "pos"],
         "encoder": "transformer",
         "mask": "fully_visible",
-        "remove_embedding_layernorm": true,
+        "remove_embedding_layernorm": false,
         "layernorm_positioning": "pre",
         "pooling": "first"
     },
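The same text-stream changes apply here; "pooling": "last" mirrors how CLIP derives the text feature from the end-of-text position rather than a leading [CLS] token. A hedged sketch of that pooling convention (dummy tensors; assumes CLIP's <|endoftext|> id of 49407, the largest id in its vocabulary):

import torch

batch, seq_len, hidden = 2, 77, 512
hidden_states = torch.randn(batch, seq_len, hidden)
token_ids = torch.randint(1, 49407, (batch, seq_len))
token_ids[:, 10] = 49407  # place <|endoftext|> somewhere in each sequence

# CLIP pools the hidden state at the <|endoftext|> position; argmax works
# because <|endoftext|> has the largest token id in CLIP's vocabulary.
eot_pos = token_ids.argmax(dim=-1)
text_features = hidden_states[torch.arange(batch), eot_pos]  # (batch, hidden)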
45 changes: 45 additions & 0 deletions models/clip/large-14_config.json
@@ -0,0 +1,45 @@
{
    "stream_0": {
        "emb_size": 768,
        "feedforward_size": 3072,
        "hidden_size": 768,
        "hidden_act": "gelu",
        "heads_num": 12,
        "layers_num": 12,
        "max_seq_length": 77,
        "embedding": ["word", "pos"],
        "encoder": "transformer",
        "mask": "causal",
        "remove_embedding_layernorm": true,
        "layernorm_positioning": "pre",
        "pooling": "last"
    },

    "stream_1": {
        "emb_size": 1024,
        "feedforward_size": 4096,
        "hidden_size": 1024,
        "hidden_act": "gelu_fast",
        "heads_num": 16,
        "layers_num": 24,
        "max_seq_length": 257,
        "embedding": ["patch", "pos"],
        "encoder": "transformer",
        "mask": "fully_visible",
        "remove_embedding_layernorm": false,
        "layernorm_positioning": "pre",
        "pooling": "first"
    },

    "data_processor": "clip",
    "embedding": ["dual"],
    "encoder": "dual",
    "target": ["clr"],
    "image_height": 224,
    "image_width": 224,
    "patch_size": 14,
    "feature_size": 768,
    "projection": true,
    "tie_weights": false,
    "dropout": 0.0
}
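The vision stream's max_seq_length of 257 follows from the image and patch sizes declared above, plus one class token; a quick check:

image_size = 224                               # image_height and image_width
patch_size = 14
num_patches = (image_size // patch_size) ** 2  # 16 * 16 = 256
seq_length = num_patches + 1                   # + class token = 257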
7 changes: 7 additions & 0 deletions models/clip_special_tokens_map.json
@@ -0,0 +1,7 @@
{
    "pad_token": "<|endoftext|>",
    "unk_token": "<|endoftext|>",
    "cls_token": "<|startoftext|>",
    "sep_token": "<|endoftext|>",
    "mask_token": "<|endoftext|>"
}
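CLIP's BPE vocabulary defines only <|startoftext|> and <|endoftext|> as special tokens, so every special-token role other than cls falls back to <|endoftext|>. A minimal sketch of reading the map:

import json

with open("models/clip_special_tokens_map.json") as f:
    special_tokens = json.load(f)

print(special_tokens["cls_token"])   # <|startoftext|>
print(special_tokens["pad_token"])   # <|endoftext|>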
125 changes: 125 additions & 0 deletions scripts/convert_clip_from_huggingface_to_tencentpretrain.py
@@ -0,0 +1,125 @@
import argparse
import collections
import torch


def convert_clip_transformer(input_model, output_model, layers_num):
    for i in range(layers_num):
        # Text encoder (stream 0): Q, K, V and output projections.
        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"]

        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"]

        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"]

        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"]

        # Text encoder: layer norms and feed-forward.
        output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.gamma"] = \
            input_model["text_model.encoder.layers." + str(i) + ".layer_norm1.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.beta"] = \
            input_model["text_model.encoder.layers." + str(i) + ".layer_norm1.bias"]

        output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.bias"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \
            input_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.bias"] = \
            input_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.bias"]

        output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.gamma"] = \
            input_model["text_model.encoder.layers." + str(i) + ".layer_norm2.weight"]
        output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.beta"] = \
            input_model["text_model.encoder.layers." + str(i) + ".layer_norm2.bias"]

        # Vision encoder (stream 1): same layout, mapped from vision_model.*.
        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.gamma"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.beta"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.bias"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.bias"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.bias"]

        output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.gamma"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.weight"]
        output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.beta"] = \
            input_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.bias"]


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_model_path", type=str, default="models/input_model.bin",
                        help="Path to the input HuggingFace CLIP weights.")
    parser.add_argument("--output_model_path", type=str, default="models/output_model.bin",
                        help="Path for the converted TencentPretrain weights.")
    parser.add_argument("--layers_num", type=int, default=12,
                        help="Number of transformer layers to convert in each stream.")

    args = parser.parse_args()

    input_model = torch.load(args.input_model_path, map_location="cpu")

    output_model = collections.OrderedDict()

    # Embeddings: text word/position tables, vision class/patch/position embeddings.
    output_model["embedding.dual.embedding_0.word.embedding.weight"] = input_model["text_model.embeddings.token_embedding.weight"]
    output_model["embedding.dual.embedding_0.pos.embedding.weight"] = input_model["text_model.embeddings.position_embedding.weight"]
    # TencentPretrain keeps the class embedding with shape (1, 1, emb_size).
    output_model["embedding.dual.embedding_1.patch.cls_emb"] = input_model["vision_model.embeddings.class_embedding"].unsqueeze(0).unsqueeze(0)
    output_model["embedding.dual.embedding_1.patch.projection.weight"] = input_model["vision_model.embeddings.patch_embedding.weight"]
    output_model["embedding.dual.embedding_1.pos.embedding.weight"] = input_model["vision_model.embeddings.position_embedding.weight"]

    # "pre_layrnorm" (sic) is the key HuggingFace uses for the vision pre-layernorm.
    output_model["embedding.dual.stream_1_layer_norm.gamma"] = input_model["vision_model.pre_layrnorm.weight"]
    output_model["embedding.dual.stream_1_layer_norm.beta"] = input_model["vision_model.pre_layrnorm.bias"]

    # Note: both streams are converted with the same layers_num, so checkpoints whose
    # text and vision towers differ in depth (e.g. large-14: 12 vs. 24 layers) are not covered.
    convert_clip_transformer(input_model, output_model, args.layers_num)

    output_model["encoder.encoder_0.layer_norm.gamma"] = input_model["text_model.final_layer_norm.weight"]
    output_model["encoder.encoder_0.layer_norm.beta"] = input_model["text_model.final_layer_norm.bias"]
    output_model["encoder.encoder_1.layer_norm.gamma"] = input_model["vision_model.post_layernorm.weight"]
    output_model["encoder.encoder_1.layer_norm.beta"] = input_model["vision_model.post_layernorm.bias"]
    output_model["target.clr.logit_scale"] = input_model["logit_scale"]
    # HuggingFace stores the projections as nn.Linear weights of shape
    # (projection_dim, hidden_size); TencentPretrain expects the transpose.
    output_model["target.clr.encoder_0_projection"] = input_model["text_projection.weight"].T
    output_model["target.clr.encoder_1_projection"] = input_model["visual_projection.weight"].T

    torch.save(output_model, args.output_model_path)


if __name__ == "__main__":
    main()
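As a usage sketch (checkpoint file names are hypothetical), converting a HuggingFace ViT-B/32 CLIP checkpoint might look like:

python3 scripts/convert_clip_from_huggingface_to_tencentpretrain.py \
    --input_model_path models/clip-vit-base-patch32.bin \
    --output_model_path models/clip_base-32.bin \
    --layers_num 12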
125 changes: 125 additions & 0 deletions scripts/convert_clip_from_tencentpretrain_to_huggingface.py
@@ -0,0 +1,125 @@
import argparse
import collections
import torch


def convert_clip_transformer(input_model, output_model, layers_num):
    # Text encoder (stream 0). Use layers_num rather than a hard-coded 12
    # so towers of other depths convert correctly.
    for i in range(layers_num):
        output_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.bias"]

        output_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.bias"]

        output_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.bias"]

        output_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.bias"]

        output_model["text_model.encoder.layers." + str(i) + ".layer_norm1.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.gamma"]
        output_model["text_model.encoder.layers." + str(i) + ".layer_norm1.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.beta"]

        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.bias"]
        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.weight"]
        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.bias"]

        output_model["text_model.encoder.layers." + str(i) + ".layer_norm2.weight"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.gamma"]
        output_model["text_model.encoder.layers." + str(i) + ".layer_norm2.bias"] = \
            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.beta"]

    # Vision encoder (stream 1).
    for i in range(layers_num):
        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.bias"]

        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.bias"]

        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.bias"]

        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.bias"]

        output_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.gamma"]
        output_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.beta"]

        output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.bias"]
        output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.weight"]
        output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.bias"]

        output_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.weight"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.gamma"]
        output_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.bias"] = \
            input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.beta"]


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_model_path", type=str, default="models/input_model.bin",
                        help="Path to the input TencentPretrain CLIP weights.")
    parser.add_argument("--output_model_path", type=str, default="models/output_model.bin",
                        help="Path for the converted HuggingFace weights.")
    parser.add_argument("--layers_num", type=int, default=12,
                        help="Number of transformer layers to convert in each stream.")

    args = parser.parse_args()

    input_model = torch.load(args.input_model_path, map_location="cpu")

    output_model = collections.OrderedDict()

    output_model["text_model.embeddings.token_embedding.weight"] = input_model["embedding.dual.embedding_0.word.embedding.weight"]
    output_model["text_model.embeddings.position_embedding.weight"] = input_model["embedding.dual.embedding_0.pos.embedding.weight"]
    # Back from TencentPretrain's (1, 1, emb_size) to HuggingFace's (emb_size,).
    output_model["vision_model.embeddings.class_embedding"] = input_model["embedding.dual.embedding_1.patch.cls_emb"].squeeze()
    output_model["vision_model.embeddings.patch_embedding.weight"] = input_model["embedding.dual.embedding_1.patch.projection.weight"]
    output_model["vision_model.embeddings.position_embedding.weight"] = input_model["embedding.dual.embedding_1.pos.embedding.weight"]

    # "pre_layrnorm" (sic) is the key HuggingFace uses for the vision pre-layernorm.
    output_model["vision_model.pre_layrnorm.weight"] = input_model["embedding.dual.stream_1_layer_norm.gamma"]
    output_model["vision_model.pre_layrnorm.bias"] = input_model["embedding.dual.stream_1_layer_norm.beta"]

    convert_clip_transformer(input_model, output_model, args.layers_num)

    output_model["text_model.final_layer_norm.weight"] = input_model["encoder.encoder_0.layer_norm.gamma"]
    output_model["text_model.final_layer_norm.bias"] = input_model["encoder.encoder_0.layer_norm.beta"]
    output_model["vision_model.post_layernorm.weight"] = input_model["encoder.encoder_1.layer_norm.gamma"]
    output_model["vision_model.post_layernorm.bias"] = input_model["encoder.encoder_1.layer_norm.beta"]
    output_model["logit_scale"] = input_model["target.clr.logit_scale"]
    # Transpose back to nn.Linear's (projection_dim, hidden_size) layout.
    output_model["text_projection.weight"] = input_model["target.clr.encoder_0_projection"].T
    output_model["visual_projection.weight"] = input_model["target.clr.encoder_1_projection"].T

    torch.save(output_model, args.output_model_path)


if __name__ == "__main__":
    main()
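One way to sanity-check the two converters together is a round trip (HuggingFace → TencentPretrain → HuggingFace) followed by a tensor-by-tensor comparison; a sketch with hypothetical file paths:

import torch

original = torch.load("models/hf_clip.bin", map_location="cpu")
roundtrip = torch.load("models/hf_clip_roundtrip.bin", map_location="cpu")

# The original checkpoint may carry extra buffers (e.g. position_ids),
# so compare only the keys the converters produce.
for name, tensor in roundtrip.items():
    assert torch.equal(tensor, original[name]), name
print("round trip OK")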