diff --git a/models/clip/base-16_config.json b/models/clip/base-16_config.json index 87f3999b..30a190b6 100644 --- a/models/clip/base-16_config.json +++ b/models/clip/base-16_config.json @@ -6,13 +6,13 @@ "hidden_act": "gelu", "heads_num": 8, "layers_num": 12, - "max_seq_length": 512, + "max_seq_length": 77, "embedding": ["word", "pos"], "encoder": "transformer", - "mask": "fully_visible", - "remove_embedding_layernorm": false, - "layernorm_positioning": "post", - "pooling": "first" + "mask": "causal", + "remove_embedding_layernorm": true, + "layernorm_positioning": "pre", + "pooling": "last" }, "stream_1": { @@ -26,7 +26,7 @@ "embedding": ["patch", "pos"], "encoder": "transformer", "mask": "fully_visible", - "remove_embedding_layernorm": true, + "remove_embedding_layernorm": false, "layernorm_positioning": "pre", "pooling": "first" }, diff --git a/models/clip/base-32_config.json b/models/clip/base-32_config.json index 26268e16..d1386817 100644 --- a/models/clip/base-32_config.json +++ b/models/clip/base-32_config.json @@ -6,13 +6,13 @@ "hidden_act": "gelu", "heads_num": 8, "layers_num": 12, - "max_seq_length": 512, + "max_seq_length": 77, "embedding": ["word", "pos"], "encoder": "transformer", - "mask": "fully_visible", - "remove_embedding_layernorm": false, - "layernorm_positioning": "post", - "pooling": "first" + "mask": "causal", + "remove_embedding_layernorm": true, + "layernorm_positioning": "pre", + "pooling": "last" }, "stream_1": { @@ -26,7 +26,7 @@ "embedding": ["patch", "pos"], "encoder": "transformer", "mask": "fully_visible", - "remove_embedding_layernorm": true, + "remove_embedding_layernorm": false, "layernorm_positioning": "pre", "pooling": "first" }, diff --git a/models/clip/large-14_config.json b/models/clip/large-14_config.json new file mode 100644 index 00000000..55d7f066 --- /dev/null +++ b/models/clip/large-14_config.json @@ -0,0 +1,45 @@ +{ + "stream_0": { + "emb_size": 768, + "feedforward_size": 3072, + "hidden_size": 768, + "hidden_act": "gelu", + "heads_num": 12, + "layers_num": 12, + "max_seq_length": 77, + "embedding": ["word", "pos"], + "encoder": "transformer", + "mask": "causal", + "remove_embedding_layernorm": true, + "layernorm_positioning": "pre", + "pooling": "last" + }, + + "stream_1": { + "emb_size": 1024, + "feedforward_size": 4096, + "hidden_size": 1024, + "hidden_act": "gelu_fast", + "heads_num": 16, + "layers_num": 24, + "max_seq_length": 257, + "embedding": ["patch", "pos"], + "encoder": "transformer", + "mask": "fully_visible", + "remove_embedding_layernorm": false, + "layernorm_positioning": "pre", + "pooling": "first" + }, + + "data_processor": "clip", + "embedding": ["dual"], + "encoder": "dual", + "target": ["clr"], + "image_height": 224, + "image_width": 224, + "patch_size": 14, + "feature_size": 768, + "projection": true, + "tie_weights": false, + "dropout": 0.0 +} \ No newline at end of file diff --git a/models/clip_special_tokens_map.json b/models/clip_special_tokens_map.json new file mode 100644 index 00000000..90f73081 --- /dev/null +++ b/models/clip_special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "cls_token": "<|startoftext|>", + "sep_token": "<|endoftext|>", + "mask_token": "<|endoftext|>" +} diff --git a/scripts/convert_clip_from_huggingface_to_tencentpretrain.py b/scripts/convert_clip_from_huggingface_to_tencentpretrain.py new file mode 100644 index 00000000..6c66d1bc --- /dev/null +++ b/scripts/convert_clip_from_huggingface_to_tencentpretrain.py @@ -0,0 +1,125 @@ +import 
argparse +import collections +import torch + + +def convert_clip_transformer(input_model, output_model, layers_num): + + for i in range(layers_num): + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.gamma"] = \ + input_model["text_model.encoder.layers." + str(i) + ".layer_norm1.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.beta"] = \ + input_model["text_model.encoder.layers." + str(i) + ".layer_norm1.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] + output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \ + input_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.bias"] = \ + input_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] + + output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.gamma"] = \ + input_model["text_model.encoder.layers." + str(i) + ".layer_norm2.weight"] + output_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.beta"] = \ + input_model["text_model.encoder.layers." + str(i) + ".layer_norm2.bias"] + + + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] + output_model["encoder.encoder_1.transformer." 
+ str(i) + ".self_attn.linear_layers.1.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.gamma"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.beta"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] + output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.bias"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] + + output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.gamma"] = \ + input_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.weight"] + output_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.beta"] = \ + input_model["vision_model.encoder.layers." 
+ str(i) + ".layer_norm2.bias"]
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--input_model_path", type=str, default="models/input_model.bin",
+                        help=".")
+    parser.add_argument("--output_model_path", type=str, default="models/output_model.bin",
+                        help=".")
+    parser.add_argument("--layers_num", type=int, default=12, help=".")
+
+    args = parser.parse_args()
+
+    input_model = torch.load(args.input_model_path, map_location="cpu")
+
+    output_model = collections.OrderedDict()
+
+    output_model["embedding.dual.embedding_0.word.embedding.weight"] = input_model["text_model.embeddings.token_embedding.weight"]
+    output_model["embedding.dual.embedding_0.pos.embedding.weight"] = input_model["text_model.embeddings.position_embedding.weight"]
+    output_model["embedding.dual.embedding_1.patch.cls_emb"] = input_model["vision_model.embeddings.class_embedding"].unsqueeze(0).unsqueeze(0)
+    output_model["embedding.dual.embedding_1.patch.projection.weight"] = input_model["vision_model.embeddings.patch_embedding.weight"]
+    output_model["embedding.dual.embedding_1.pos.embedding.weight"] = input_model["vision_model.embeddings.position_embedding.weight"]
+
+    output_model["embedding.dual.stream_1_layer_norm.gamma"] = input_model["vision_model.pre_layrnorm.weight"]
+    output_model["embedding.dual.stream_1_layer_norm.beta"] = input_model["vision_model.pre_layrnorm.bias"]
+
+    convert_clip_transformer(input_model, output_model, args.layers_num)
+
+    output_model["encoder.encoder_0.layer_norm.gamma"] = input_model["text_model.final_layer_norm.weight"]
+    output_model["encoder.encoder_0.layer_norm.beta"] = input_model["text_model.final_layer_norm.bias"]
+    output_model["encoder.encoder_1.layer_norm.gamma"] = input_model["vision_model.post_layernorm.weight"]
+    output_model["encoder.encoder_1.layer_norm.beta"] = input_model["vision_model.post_layernorm.bias"]
+    output_model["target.clr.logit_scale"] = input_model["logit_scale"]
+    output_model["target.clr.encoder_0_projection"] = input_model["text_projection.weight"].T
+    output_model["target.clr.encoder_1_projection"] = input_model["visual_projection.weight"].T
+
+    torch.save(output_model, args.output_model_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/convert_clip_from_tencentpretrain_to_huggingface.py b/scripts/convert_clip_from_tencentpretrain_to_huggingface.py
new file mode 100644
index 00000000..adde5602
--- /dev/null
+++ b/scripts/convert_clip_from_tencentpretrain_to_huggingface.py
@@ -0,0 +1,125 @@
+import argparse
+import collections
+import torch
+
+
+def convert_clip_transformer(input_model, output_model, layers_num):
+
+    for i in range(layers_num):
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.0.bias"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.1.bias"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.linear_layers.2.bias"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".self_attn.final_linear.bias"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".layer_norm1.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.gamma"]
+        output_model["text_model.encoder.layers." + str(i) + ".layer_norm1.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_1.beta"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_1.bias"]
+        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.weight"]
+        output_model["text_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".feed_forward.linear_2.bias"]
+
+        output_model["text_model.encoder.layers." + str(i) + ".layer_norm2.weight"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.gamma"]
+        output_model["text_model.encoder.layers." + str(i) + ".layer_norm2.bias"] = \
+            input_model["encoder.encoder_0.transformer." + str(i) + ".layer_norm_2.beta"]
+
+    for i in range(layers_num):
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.weight"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.weight"]
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.q_proj.bias"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.0.bias"]
+
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.weight"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.weight"]
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.k_proj.bias"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.1.bias"]
+
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.weight"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.weight"]
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.v_proj.bias"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.linear_layers.2.bias"]
+
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.weight"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.weight"]
+        output_model["vision_model.encoder.layers." + str(i) + ".self_attn.out_proj.bias"] = \
+            input_model["encoder.encoder_1.transformer." + str(i) + ".self_attn.final_linear.bias"]
+
+        output_model["vision_model.encoder.layers."
+ str(i) + ".layer_norm1.weight"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.gamma"] + output_model["vision_model.encoder.layers." + str(i) + ".layer_norm1.bias"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_1.beta"] + + output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.weight"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.weight"] + output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc1.bias"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_1.bias"] + output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.weight"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.weight"] + output_model["vision_model.encoder.layers." + str(i) + ".mlp.fc2.bias"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".feed_forward.linear_2.bias"] + + output_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.weight"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.gamma"] + output_model["vision_model.encoder.layers." + str(i) + ".layer_norm2.bias"] = \ + input_model["encoder.encoder_1.transformer." + str(i) + ".layer_norm_2.beta"] + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--input_model_path", type=str, default="models/input_model.bin", + help=".") + parser.add_argument("--output_model_path", type=str, default="models/output_model.bin", + help=".") + parser.add_argument("--layers_num", type=int, default=12, help=".") + + args = parser.parse_args() + + input_model = torch.load(args.input_model_path, map_location="cpu") + + output_model = collections.OrderedDict() + + output_model["text_model.embeddings.token_embedding.weight"] = input_model["embedding.dual.embedding_0.word.embedding.weight"] + output_model["text_model.embeddings.position_embedding.weight"] = input_model["embedding.dual.embedding_0.pos.embedding.weight"] + output_model["vision_model.embeddings.class_embedding"] = input_model["embedding.dual.embedding_1.patch.cls_emb"].squeeze().squeeze() + output_model["vision_model.embeddings.patch_embedding.weight"] = input_model["embedding.dual.embedding_1.patch.projection.weight"] + output_model["vision_model.embeddings.position_embedding.weight"] = input_model["embedding.dual.embedding_1.pos.embedding.weight"] + + output_model["vision_model.pre_layrnorm.weight"] = input_model["embedding.dual.stream_1_layer_norm.gamma"] + output_model["vision_model.pre_layrnorm.bias"] = input_model["embedding.dual.stream_1_layer_norm.beta"] + + convert_clip_transformer(input_model, output_model, args.layers_num) + + output_model["text_model.final_layer_norm.weight"] = input_model["encoder.encoder_0.layer_norm.gamma"] + output_model["text_model.final_layer_norm.bias"] = input_model["encoder.encoder_0.layer_norm.beta"] + output_model["vision_model.post_layernorm.weight"] = input_model["encoder.encoder_1.layer_norm.gamma"] + output_model["vision_model.post_layernorm.bias"] = input_model["encoder.encoder_1.layer_norm.beta"] + output_model["logit_scale"] = input_model["target.clr.logit_scale"] + output_model["text_projection.weight"] = input_model["target.clr.encoder_0_projection"].T + output_model["visual_projection.weight"] = input_model["target.clr.encoder_1_projection"].T + + torch.save(output_model, args.output_model_path) + + +if __name__ == "__main__": + main() diff --git 
a/tencentpretrain/opts.py b/tencentpretrain/opts.py index 57b03cde..b7c22d79 100755 --- a/tencentpretrain/opts.py +++ b/tencentpretrain/opts.py @@ -176,7 +176,7 @@ def infer_opts(parser): def tokenizer_opts(parser): parser.add_argument("--tokenizer", choices=["bert", "bpe", "char", "space", "xlmroberta", "image", "text_image", - "virtual", "hfpretrained"], default="bert", + "virtual", "hfpretrained", "clip"], default="bert", help="Specify the tokenizer." "Original Google BERT uses bert tokenizer." "Char tokenizer segments sentences into characters." diff --git a/tencentpretrain/utils/__init__.py b/tencentpretrain/utils/__init__.py index efa947e4..d1bcb60c 100644 --- a/tencentpretrain/utils/__init__.py +++ b/tencentpretrain/utils/__init__.py @@ -7,7 +7,8 @@ str2tokenizer = {"char": CharTokenizer, "space": SpaceTokenizer, "bert": BertTokenizer, "bpe": BPETokenizer, "xlmroberta": XLMRobertaTokenizer, "image": ImageTokenizer, - "text_image": TextImageTokenizer, "virtual": VirtualTokenizer, "hfpretrained": HFPreTrainedTokenizer} + "text_image": TextImageTokenizer, "virtual": VirtualTokenizer, "hfpretrained": HFPreTrainedTokenizer, + "clip": ClipTokenizer} str2dataset = {"bert": BertDataset, "lm": LmDataset, "mlm": MlmDataset, "bilm": BilmDataset, "albert": AlbertDataset, "mt": MtDataset, "t5": T5Dataset, "gsg": GsgDataset, "bart": BartDataset, @@ -34,7 +35,8 @@ str2adv = {"fgm": FGM, "pgd": PGD} __all__ = ["CharTokenizer", "SpaceTokenizer", "BertTokenizer", "BPETokenizer", "XLMRobertaTokenizer", - "ImageTokenizer", "TextImageTokenizer", "str2tokenizer", + "ImageTokenizer", "TextImageTokenizer", "VirtualTokenizer", "HFPreTrainedTokenizer", + "ClipTokenizer", "str2tokenizer", "BertDataset", "LmDataset", "MlmDataset", "BilmDataset", "AlbertDataset", "MtDataset", "T5Dataset", "GsgDataset", "BartDataset", "ClsDataset", "PrefixlmDataset", "ClsMlmDataset", diff --git a/tencentpretrain/utils/tokenizers.py b/tencentpretrain/utils/tokenizers.py index 6ee3512a..d196c8bf 100644 --- a/tencentpretrain/utils/tokenizers.py +++ b/tencentpretrain/utils/tokenizers.py @@ -622,3 +622,111 @@ def convert_ids_to_tokens(self, ids): def decode(self, ids): return self.tokenizer.decode(ids) + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + +class ClipTokenizer(Tokenizer): + """Construct a CLIP tokenizer. 
Based on byte-level Byte-Pair-Encoding."""
+    def __init__(self, args, is_src=True, errors="replace", unk_token="<|endoftext|>"):
+        self.vocab_file = args.vocab_path
+        self.merges_file = args.merges_path
+        self.errors = errors  # how to handle errors when decoding bytes back to UTF-8
+        self.unk_token = unk_token
+
+        try:
+            import ftfy
+
+            self.fix_text = ftfy.fix_text
+        except ImportError:
+            # Without ftfy, fall back to the BERT-style BasicTokenizer from this module for pre-tokenization.
+            self.nlp = BasicTokenizer(do_lower_case=True)
+            self.fix_text = None
+        import json
+        with open(self.vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.vocab = self.encoder
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(self.merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
+
+        # The \p{L} / \p{N} classes require the third-party `regex` package (used as `re` in this module).
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE,
+        )
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        if self.fix_text is None:
+            text = " ".join(self.nlp.tokenize(text))
+        else:
+            text = whitespace_clean(self.fix_text(text)).lower()
+
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens (str) into ids using the vocab."""
+        return [self.encoder.get(token, self.encoder.get(self.unk_token)) for token in tokens]
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids (integer) into tokens (str) using the vocab."""
+        return [self.decoder.get(index) for index in ids]
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) into a single string."""
+        text = "".join(tokens)
+        byte_array = bytearray([self.byte_decoder[c] for c in text])
+        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
+        return text
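
Usage sketch (illustrative only, not part of the patch): the checkpoint name, the output path, the vocab/merges locations, and the bare argparse.Namespace below are assumptions; vocab.json and merges.txt are the standard Hugging Face CLIP tokenizer files.

# Convert a Hugging Face CLIP checkpoint (assumed saved as models/input_model.bin) to TencentPretrain format:
#   python3 scripts/convert_clip_from_huggingface_to_tencentpretrain.py \
#       --input_model_path models/input_model.bin \
#       --output_model_path models/clip-base-32.bin --layers_num 12
# Then tokenize a caption with the new ClipTokenizer:
import argparse

from tencentpretrain.utils.tokenizers import ClipTokenizer

args = argparse.Namespace(vocab_path="models/clip/vocab.json", merges_path="models/clip/merges.txt")
tokenizer = ClipTokenizer(args)
tokens = tokenizer.tokenize("a photo of a cat")  # e.g. ['a</w>', 'photo</w>', 'of</w>', 'a</w>', 'cat</w>']
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, ids)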