openai/
ones on the hub. I can't merge as I'm not the owner, that's \n\n@mitchellw\n\t \n","updatedAt":"2023-06-23T18:09:00.417Z","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.9706042408943176},"editors":["rwightman"],"editorAvatarUrls":["https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg"],"reactions":[],"isReport":false}},{"id":"65a6f601c2cf664a51917f7c","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302,"isOwner":false,"isOrgMember":true},"createdAt":"2024-01-16T21:32:49.000Z","type":"comment","data":{"edited":false,"hidden":false,"latest":{"raw":"@patrickvonplaten so I have write access and can merge this now, is this still a desired change making it match original tokenizer or think people are relying on this behaviour?","html":"\n\n@patrickvonplaten\n\t so I have write access and can merge this now, is this still a desired change making it match original tokenizer or think people are relying on this behaviour?
\n","updatedAt":"2024-01-16T21:32:49.588Z","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.9218249917030334},"editors":["rwightman"],"editorAvatarUrls":["https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg"],"reactions":[],"isReport":false}}],"pinned":false,"locked":false,"collection":"discussions","isPullRequest":true,"changes":{"base":"refs/heads/main"},"filesWithConflicts":[]},"repo":{"name":"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k","type":"model"},"activeTab":"discussion","diffStats":[{"additions":24,"deletions":1,"path":"special_tokens_map.json"},{"additions":33,"deletions":33,"path":"tokenizer_config.json"}],"discussionRole":0,"watched":false,"muted":false,"repoDiscussionsLocked":false}">Correct Transformers Pad Token
openai/
ones on the hub. I can't merge as I'm not the owner, that's \n\n@mitchellw\n\t \n","updatedAt":"2023-06-23T18:09:00.417Z","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.9706042408943176},"editors":["rwightman"],"editorAvatarUrls":["https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg"],"reactions":[],"isReport":false}},{"id":"65a6f601c2cf664a51917f7c","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302,"isOwner":false,"isOrgMember":true},"createdAt":"2024-01-16T21:32:49.000Z","type":"comment","data":{"edited":false,"hidden":false,"latest":{"raw":"@patrickvonplaten so I have write access and can merge this now, is this still a desired change making it match original tokenizer or think people are relying on this behaviour?","html":"\n\n@patrickvonplaten\n\t so I have write access and can merge this now, is this still a desired change making it match original tokenizer or think people are relying on this behaviour?
\n","updatedAt":"2024-01-16T21:32:49.588Z","author":{"_id":"604a5184dca2c7ac7508b849","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg","fullname":"Ross Wightman","name":"rwightman","type":"user","isPro":false,"isHf":true,"isHfAdmin":false,"isMod":false,"followerCount":302}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.9218249917030334},"editors":["rwightman"],"editorAvatarUrls":["https://cdn-avatars.huggingface.co/v1/production/uploads/1667002643224-604a5184dca2c7ac7508b849.jpeg"],"reactions":[],"isReport":false}}],"pinned":false,"locked":false,"collection":"discussions","isPullRequest":true,"changes":{"base":"refs/heads/main"},"filesWithConflicts":[]},"primaryEmailConfirmed":false,"repo":{"name":"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k","type":"model"},"discussionRole":0,"acceptLanguages":["*"],"hideComments":true,"repoDiscussionsLocked":false,"isDiscussionAuthor":false}">import open_clip
tokenizer = open_clip.get_tokenizer('ViT-bigG-14')
print(tokenizer("hello"))
gives:
tensor([[49406, 3306, 49407, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]])
which means the padding token should be 0, not 49407.
This PR corrects the Hugging Face Transformers version so that it matches the open_clip tokenizer:
from transformers import CLIPTokenizer
tokenizer = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
print(tokenizer("hello", max_length=77, padding="max_length", truncation=True))
@patrickvonplaten
@julien-c
it is indeed wrong, but as mentioned in slack, this probably means all HF Transformers based tokenizers for OpenCLIP AND probably the OpenAI originals are wrong as OpenCLIP Transformers tokenizer config was just copied from the openai/
ones on the hub. I can't merge as I'm not the owner — that's
@mitchellw
@patrickvonplaten so I have write access and can merge this now, is this still a desired change making it match original tokenizer or think people are relying on this behaviour?