import{s as oo,o as no,n as we}from"../chunks/scheduler.18a86fab.js";import{S as so,i as ao,g as m,s as a,r as f,A as ro,h as p,f as s,c as r,j as Z,x as u,u as h,k as U,y as i,a as l,v as g,d as _,t as b,w as y}from"../chunks/index.98837b22.js";import{T as Dt}from"../chunks/Tip.77304350.js";import{D as Q}from"../chunks/Docstring.a1ef7999.js";import{C as rt}from"../chunks/CodeBlock.8d0c2e8a.js";import{E as at}from"../chunks/ExampleCodeBlock.8c3ee1f9.js";import{H as P,E as lo}from"../chunks/getInferenceSnippets.06c2775f.js";function io(M){let t,w="Example:",c,d,v;return d=new rt({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFNFV0RDb25maWclMkMlMjBTRVdETW9kZWwlMEElMEElMjMlMjBJbml0aWFsaXppbmclMjBhJTIwU0VXLUQlMjBhc2FwcCUyRnNldy1kLXRpbnktMTAwayUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQWNvbmZpZ3VyYXRpb24lMjAlM0QlMjBTRVdEQ29uZmlnKCklMEElMEElMjMlMjBJbml0aWFsaXppbmclMjBhJTIwbW9kZWwlMjAod2l0aCUyMHJhbmRvbSUyMHdlaWdodHMpJTIwZnJvbSUyMHRoZSUyMGFzYXBwJTJGc2V3LWQtdGlueS0xMDBrJTIwc3R5bGUlMjBjb25maWd1cmF0aW9uJTBBbW9kZWwlMjAlM0QlMjBTRVdETW9kZWwoY29uZmlndXJhdGlvbiklMEElMEElMjMlMjBBY2Nlc3NpbmclMjB0aGUlMjBtb2RlbCUyMGNvbmZpZ3VyYXRpb24lMEFjb25maWd1cmF0aW9uJTIwJTNEJTIwbW9kZWwuY29uZmln",highlighted:`from transformers import SEWDConfig, SEWDModel
# Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
configuration = SEWDConfig()
# Initializing a model (with random weights) from the asapp/sew-d-tiny-100k style configuration
model = SEWDModel(configuration)
# Accessing the model configuration
configuration = model.config`,wrap:!1}}),{c(){t=m("p"),t.textContent=w,c=a(),f(d.$$.fragment)},l(n){t=p(n,"P",{"data-svelte-h":!0}),u(t)!=="svelte-11lpom8"&&(t.textContent=w),c=r(n),h(d.$$.fragment,n)},m(n,T){l(n,t,T),l(n,c,T),g(d,n,T),v=!0},p:we,i(n){v||(_(d.$$.fragment,n),v=!0)},o(n){b(d.$$.fragment,n),v=!1},d(n){n&&(s(t),s(c)),y(d,n)}}}function co(M){let t,w=`Although the recipe for forward pass needs to be defined within this function, one should call the Module
instance afterwards instead of this since the former takes care of running the pre and post processing steps while
the latter silently ignores them.`;return{c(){t=m("p"),t.innerHTML=w},l(c){t=p(c,"P",{"data-svelte-h":!0}),u(t)!=="svelte-fincs2"&&(t.innerHTML=w)},m(c,d){l(c,t,d)},p:we,d(c){c&&s(t)}}}function mo(M){let t,w=`Although the recipe for forward pass needs to be defined within this function, one should call the Module
instance afterwards instead of this since the former takes care of running the pre and post processing steps while
the latter silently ignores them.`;return{c(){t=m("p"),t.innerHTML=w},l(c){t=p(c,"P",{"data-svelte-h":!0}),u(t)!=="svelte-fincs2"&&(t.innerHTML=w)},m(c,d){l(c,t,d)},p:we,d(c){c&&s(t)}}}function po(M){let t,w="Example:",c,d,v;return d=new rt({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Qcm9jZXNzb3IlMkMlMjBTRVdERm9yQ1RDJTBBZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBaW1wb3J0JTIwdG9yY2glMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmhmLWludGVybmFsLXRlc3RpbmclMkZsaWJyaXNwZWVjaF9hc3JfZGVtbyUyMiUyQyUyMCUyMmNsZWFuJTIyJTJDJTIwc3BsaXQlM0QlMjJ2YWxpZGF0aW9uJTIyKSUwQWRhdGFzZXQlMjAlM0QlMjBkYXRhc2V0LnNvcnQoJTIyaWQlMjIpJTBBc2FtcGxpbmdfcmF0ZSUyMCUzRCUyMGRhdGFzZXQuZmVhdHVyZXMlNUIlMjJhdWRpbyUyMiU1RC5zYW1wbGluZ19yYXRlJTBBJTBBcHJvY2Vzc29yJTIwJTNEJTIwQXV0b1Byb2Nlc3Nvci5mcm9tX3ByZXRyYWluZWQoJTIyYXNhcHAlMkZzZXctZC10aW55LTEwMGslMjIpJTBBbW9kZWwlMjAlM0QlMjBTRVdERm9yQ1RDLmZyb21fcHJldHJhaW5lZCglMjJhc2FwcCUyRnNldy1kLXRpbnktMTAwayUyMiklMEElMEElMjMlMjBhdWRpbyUyMGZpbGUlMjBpcyUyMGRlY29kZWQlMjBvbiUyMHRoZSUyMGZseSUwQWlucHV0cyUyMCUzRCUyMHByb2Nlc3NvcihkYXRhc2V0JTVCMCU1RCU1QiUyMmF1ZGlvJTIyJTVEJTVCJTIyYXJyYXklMjIlNUQlMkMlMjBzYW1wbGluZ19yYXRlJTNEc2FtcGxpbmdfcmF0ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIpJTBBd2l0aCUyMHRvcmNoLm5vX2dyYWQoKSUzQSUwQSUyMCUyMCUyMCUyMGxvZ2l0cyUyMCUzRCUyMG1vZGVsKCoqaW5wdXRzKS5sb2dpdHMlMEFwcmVkaWN0ZWRfaWRzJTIwJTNEJTIwdG9yY2guYXJnbWF4KGxvZ2l0cyUyQyUyMGRpbSUzRC0xKSUwQSUwQSUyMyUyMHRyYW5zY3JpYmUlMjBzcGVlY2glMEF0cmFuc2NyaXB0aW9uJTIwJTNEJTIwcHJvY2Vzc29yLmJhdGNoX2RlY29kZShwcmVkaWN0ZWRfaWRzKSUwQXRyYW5zY3JpcHRpb24lNUIwJTVEJTBBJTBBaW5wdXRzJTVCJTIybGFiZWxzJTIyJTVEJTIwJTNEJTIwcHJvY2Vzc29yKHRleHQlM0RkYXRhc2V0JTVCMCU1RCU1QiUyMnRleHQlMjIlNUQlMkMlMjByZXR1cm5fdGVuc29ycyUzRCUyMnB0JTIyKS5pbnB1dF9pZHMlMEElMEElMjMlMjBjb21wdXRlJTIwbG9zcyUwQWxvc3MlMjAlM0QlMjBtb2RlbCgqKmlucHV0cykubG9zcyUwQXJvdW5kKGxvc3MuaXRlbSgpJTJDJTIwMik=",highlighted:`from transformers import AutoProcessor, SEWDForCTC
from datasets import load_dataset
import torch
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate
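# LibriSpeech audio is sampled at 16 kHz, the rate the SEW-D checkpoints were pretrained on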
processor = AutoProcessor.from_pretrained("asapp/sew-d-tiny-100k")
model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k")
# audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
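# argmax over the vocabulary dimension is a greedy CTC decode per frame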
# transcribe speech
transcription = processor.batch_decode(predicted_ids)
transcription[0]
...
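# encode the reference transcription as label ids so the model can compute the CTC loss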
inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids
# compute loss
loss = model(**inputs).loss
round(loss.item(), 2)
...`,wrap:!1}}),{c(){t=m("p"),t.textContent=w,c=a(),f(d.$$.fragment)},l(n){t=p(n,"P",{"data-svelte-h":!0}),u(t)!=="svelte-11lpom8"&&(t.textContent=w),c=r(n),h(d.$$.fragment,n)},m(n,T){l(n,t,T),l(n,c,T),g(d,n,T),v=!0},p:we,i(n){v||(_(d.$$.fragment,n),v=!0)},o(n){b(d.$$.fragment,n),v=!1},d(n){n&&(s(t),s(c)),y(d,n)}}}function uo(M){let t,w=`Although the recipe for forward pass needs to be defined within this function, one should call the Module
instance afterwards instead of this since the former takes care of running the pre and post processing steps while
the latter silently ignores them.`;return{c(){t=m("p"),t.innerHTML=w},l(c){t=p(c,"P",{"data-svelte-h":!0}),u(t)!=="svelte-fincs2"&&(t.innerHTML=w)},m(c,d){l(c,t,d)},p:we,d(c){c&&s(t)}}}function fo(M){let t,w="Example of single-label classification:",c,d,v;return d=new rt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b1Rva2VuaXplciUyQyUyMFNFV0RGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uJTBBJTBBdG9rZW5pemVyJTIwJTNEJTIwQXV0b1Rva2VuaXplci5mcm9tX3ByZXRyYWluZWQoJTIyYXNhcHAlMkZzZXctZC10aW55LTEwMGslMjIpJTBBbW9kZWwlMjAlM0QlMjBTRVdERm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbi5mcm9tX3ByZXRyYWluZWQoJTIyYXNhcHAlMkZzZXctZC10aW55LTEwMGslMjIpJTBBJTBBaW5wdXRzJTIwJTNEJTIwdG9rZW5pemVyKCUyMkhlbGxvJTJDJTIwbXklMjBkb2clMjBpcyUyMGN1dGUlMjIlMkMlMjByZXR1cm5fdGVuc29ycyUzRCUyMnB0JTIyKSUwQSUwQXdpdGglMjB0b3JjaC5ub19ncmFkKCklM0ElMEElMjAlMjAlMjAlMjBsb2dpdHMlMjAlM0QlMjBtb2RlbCgqKmlucHV0cykubG9naXRzJTBBJTBBcHJlZGljdGVkX2NsYXNzX2lkJTIwJTNEJTIwbG9naXRzLmFyZ21heCgpLml0ZW0oKSUwQW1vZGVsLmNvbmZpZy5pZDJsYWJlbCU1QnByZWRpY3RlZF9jbGFzc19pZCU1RCUwQSUwQSUyMyUyMFRvJTIwdHJhaW4lMjBhJTIwbW9kZWwlMjBvbiUyMCU2MG51bV9sYWJlbHMlNjAlMjBjbGFzc2VzJTJDJTIweW91JTIwY2FuJTIwcGFzcyUyMCU2MG51bV9sYWJlbHMlM0RudW1fbGFiZWxzJTYwJTIwdG8lMjAlNjAuZnJvbV9wcmV0cmFpbmVkKC4uLiklNjAlMEFudW1fbGFiZWxzJTIwJTNEJTIwbGVuKG1vZGVsLmNvbmZpZy5pZDJsYWJlbCklMEFtb2RlbCUyMCUzRCUyMFNFV0RGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uLmZyb21fcHJldHJhaW5lZCglMjJhc2FwcCUyRnNldy1kLXRpbnktMTAwayUyMiUyQyUyMG51bV9sYWJlbHMlM0RudW1fbGFiZWxzKSUwQSUwQWxhYmVscyUyMCUzRCUyMHRvcmNoLnRlbnNvciglNUIxJTVEKSUwQWxvc3MlMjAlM0QlMjBtb2RlbCgqKmlucHV0cyUyQyUyMGxhYmVscyUzRGxhYmVscykubG9zcyUwQXJvdW5kKGxvc3MuaXRlbSgpJTJDJTIwMik=",highlighted:`import torch
from transformers import AutoTokenizer, SEWDForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("asapp/sew-d-tiny-100k")
model = SEWDForSequenceClassification.from_pretrained("asapp/sew-d-tiny-100k")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]
...
# To train a model on \`num_labels\` classes, you can pass \`num_labels=num_labels\` to \`.from_pretrained(...)\`
num_labels = len(model.config.id2label)
model = SEWDForSequenceClassification.from_pretrained("asapp/sew-d-tiny-100k", num_labels=num_labels)
labels = torch.tensor([1])
loss = model(**inputs, labels=labels).loss
round(loss.item(), 2)
...`,wrap:!1}}),{c(){t=m("p"),t.textContent=w,c=a(),f(d.$$.fragment)},l(n){t=p(n,"P",{"data-svelte-h":!0}),u(t)!=="svelte-ykxpe4"&&(t.textContent=w),c=r(n),h(d.$$.fragment,n)},m(n,T){l(n,t,T),l(n,c,T),g(d,n,T),v=!0},p:we,i(n){v||(_(d.$$.fragment,n),v=!0)},o(n){b(d.$$.fragment,n),v=!1},d(n){n&&(s(t),s(c)),y(d,n)}}}function ho(M){let t,w="Example of multi-label classification:",c,d,v;return d=new rt({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b1Rva2VuaXplciUyQyUyMFNFV0RGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uJTBBJTBBdG9rZW5pemVyJTIwJTNEJTIwQXV0b1Rva2VuaXplci5mcm9tX3ByZXRyYWluZWQoJTIyYXNhcHAlMkZzZXctZC10aW55LTEwMGslMjIpJTBBbW9kZWwlMjAlM0QlMjBTRVdERm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbi5mcm9tX3ByZXRyYWluZWQoJTIyYXNhcHAlMkZzZXctZC10aW55LTEwMGslMjIlMkMlMjBwcm9ibGVtX3R5cGUlM0QlMjJtdWx0aV9sYWJlbF9jbGFzc2lmaWNhdGlvbiUyMiklMEElMEFpbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoJTIySGVsbG8lMkMlMjBteSUyMGRvZyUyMGlzJTIwY3V0ZSUyMiUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIpJTBBJTBBd2l0aCUyMHRvcmNoLm5vX2dyYWQoKSUzQSUwQSUyMCUyMCUyMCUyMGxvZ2l0cyUyMCUzRCUyMG1vZGVsKCoqaW5wdXRzKS5sb2dpdHMlMEElMEFwcmVkaWN0ZWRfY2xhc3NfaWRzJTIwJTNEJTIwdG9yY2guYXJhbmdlKDAlMkMlMjBsb2dpdHMuc2hhcGUlNUItMSU1RCklNUJ0b3JjaC5zaWdtb2lkKGxvZ2l0cykuc3F1ZWV6ZShkaW0lM0QwKSUyMCUzRSUyMDAuNSU1RCUwQSUwQSUyMyUyMFRvJTIwdHJhaW4lMjBhJTIwbW9kZWwlMjBvbiUyMCU2MG51bV9sYWJlbHMlNjAlMjBjbGFzc2VzJTJDJTIweW91JTIwY2FuJTIwcGFzcyUyMCU2MG51bV9sYWJlbHMlM0RudW1fbGFiZWxzJTYwJTIwdG8lMjAlNjAuZnJvbV9wcmV0cmFpbmVkKC4uLiklNjAlMEFudW1fbGFiZWxzJTIwJTNEJTIwbGVuKG1vZGVsLmNvbmZpZy5pZDJsYWJlbCklMEFtb2RlbCUyMCUzRCUyMFNFV0RGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJhc2FwcCUyRnNldy1kLXRpbnktMTAwayUyMiUyQyUyMG51bV9sYWJlbHMlM0RudW1fbGFiZWxzJTJDJTIwcHJvYmxlbV90eXBlJTNEJTIybXVsdGlfbGFiZWxfY2xhc3NpZmljYXRpb24lMjIlMEEpJTBBJTBBbGFiZWxzJTIwJTNEJTIwdG9yY2guc3VtKCUwQSUyMCUyMCUyMCUyMHRvcmNoLm5uLmZ1bmN0aW9uYWwub25lX2hvdChwcmVkaWN0ZWRfY2xhc3NfaWRzJTVCTm9uZSUyQyUyMCUzQSU1RC5jbG9uZSgpJTJDJTIwbnVtX2NsYXNzZXMlM0RudW1fbGFiZWxzKSUyQyUyMGRpbSUzRDElMEEpLnRvKHRvcmNoLmZsb2F0KSUwQWxvc3MlMjAlM0QlMjBtb2RlbCgqKmlucHV0cyUyQyUyMGxhYmVscyUzRGxhYmVscykubG9zcw==",highlighted:`import torch
from transformers import AutoTokenizer, SEWDForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("asapp/sew-d-tiny-100k")
model = SEWDForSequenceClassification.from_pretrained("asapp/sew-d-tiny-100k", problem_type="multi_label_classification")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]
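# predicted_class_ids now holds every label whose sigmoid score exceeded 0.5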
# To train a model on \`num_labels\` classes, you can pass \`num_labels=num_labels\` to \`.from_pretrained(...)\`
num_labels = len(model.config.id2label)
model = SEWDForSequenceClassification.from_pretrained(
"asapp/sew-d-tiny-100k", num_labels=num_labels, problem_type="multi_label_classification"
)
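# build a multi-hot float target vector from the predicted class ids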
labels = torch.sum(
torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss`,wrap:!1}}),{c(){t=m("p"),t.textContent=w,c=a(),f(d.$$.fragment)},l(n){t=p(n,"P",{"data-svelte-h":!0}),u(t)!=="svelte-1l8e32d"&&(t.textContent=w),c=r(n),h(d.$$.fragment,n)},m(n,T){l(n,t,T),l(n,c,T),g(d,n,T),v=!0},p:we,i(n){v||(_(d.$$.fragment,n),v=!0)},o(n){b(d.$$.fragment,n),v=!1},d(n){n&&(s(t),s(c)),y(d,n)}}}function go(M){let t,w,c,d,v,n="This model was released on 2021-09-14 and added to Hugging Face Transformers on 2021-10-15.",T,A,Ze,V,Ft='',Re,O,Ve,K,Ut=`SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in Performance-Efficiency Trade-offs
in Unsupervised Pre-training for Speech Recognition by Felix Wu, Kwangyoun Kim,
Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.`,Ge,ee,zt="The abstract from the paper is the following:",Ie,te,qt=`This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
time, SEW reduces word error rate by 25-50% across different model sizes.`,Ne,oe,Zt='This model was contributed by anton-l.',Be,ne,He,se,Rt=`
The SEWDModel forward method, overrides the __call__
special method.',_t,N,Oe,ue,Ke,k,fe,bt,Ee,Yt="SEW-D Model with a language modeling
head on top for Connectionist Temporal Classification (CTC).",yt,je,Qt=`This model inherits from PreTrainedModel. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)`,vt,xe,Pt=`This model is also a PyTorch torch.nn.Module subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.`,wt,D,he,Tt,Je,At='The SEWDForCTC forward method, overrides the __call__
special method.',Mt,B,Wt,H,et,ge,tt,$,_e,Ct,De,Ot=`SEWD Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB
Keyword Spotting.`,kt,Fe,Kt=`This model inherits from PreTrainedModel. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)`,$t,Ue,eo=`This model is also a PyTorch torch.nn.Module subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.`,St,S,be,Et,ze,to='The SEWDForSequenceClassification forward method, overrides the __call__
special method.',jt,X,xt,L,Jt,Y,ot,ye,nt,qe,st;return A=new P({props:{title:"SEW-D",local:"sew-d",headingTag:"h1"}}),O=new P({props:{title:"Overview",local:"overview",headingTag:"h2"}}),ne=new P({props:{title:"Usage tips",local:"usage-tips",headingTag:"h2"}}),ae=new P({props:{title:"Resources",local:"resources",headingTag:"h2"}}),le=new P({props:{title:"SEWDConfig",local:"transformers.SEWDConfig",headingTag:"h2"}}),ie=new Q({props:{name:"class transformers.SEWDConfig",anchor:"transformers.SEWDConfig",parameters:[{name:"vocab_size",val:" = 32"},{name:"hidden_size",val:" = 768"},{name:"num_hidden_layers",val:" = 12"},{name:"num_attention_heads",val:" = 12"},{name:"intermediate_size",val:" = 3072"},{name:"squeeze_factor",val:" = 2"},{name:"max_position_embeddings",val:" = 512"},{name:"position_buckets",val:" = 256"},{name:"share_att_key",val:" = True"},{name:"relative_attention",val:" = True"},{name:"pos_att_type",val:" = ('p2c', 'c2p')"},{name:"norm_rel_ebd",val:" = 'layer_norm'"},{name:"hidden_act",val:" = 'gelu_python'"},{name:"hidden_dropout",val:" = 0.1"},{name:"activation_dropout",val:" = 0.1"},{name:"attention_dropout",val:" = 0.1"},{name:"feat_proj_dropout",val:" = 0.0"},{name:"final_dropout",val:" = 0.1"},{name:"initializer_range",val:" = 0.02"},{name:"layer_norm_eps",val:" = 1e-07"},{name:"feature_layer_norm_eps",val:" = 1e-05"},{name:"feat_extract_norm",val:" = 'group'"},{name:"feat_extract_activation",val:" = 'gelu'"},{name:"conv_dim",val:" = (64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)"},{name:"conv_stride",val:" = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)"},{name:"conv_kernel",val:" = (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)"},{name:"conv_bias",val:" = False"},{name:"num_conv_pos_embeddings",val:" = 128"},{name:"num_conv_pos_embedding_groups",val:" = 16"},{name:"apply_spec_augment",val:" = True"},{name:"mask_time_prob",val:" = 0.05"},{name:"mask_time_length",val:" = 10"},{name:"mask_time_min_masks",val:" = 2"},{name:"mask_feature_prob",val:" = 0.0"},{name:"mask_feature_length",val:" = 10"},{name:"mask_feature_min_masks",val:" = 0"},{name:"ctc_loss_reduction",val:" = 'mean'"},{name:"ctc_zero_infinity",val:" = False"},{name:"use_weighted_layer_sum",val:" = False"},{name:"classifier_proj_size",val:" = 256"},{name:"pad_token_id",val:" = 0"},{name:"bos_token_id",val:" = 1"},{name:"eos_token_id",val:" = 2"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.SEWDConfig.vocab_size",description:`vocab_size (int
, optional, defaults to 32) —
Vocabulary size of the SEW-D model. Defines the number of different tokens that can be represented by the
input_ids
passed when calling SEWD
.`,name:"vocab_size"},{anchor:"transformers.SEWDConfig.hidden_size",description:`hidden_size (int
, optional, defaults to 768) —
Dimensionality of the encoder layers and the pooler layer.`,name:"hidden_size"},{anchor:"transformers.SEWDConfig.num_hidden_layers",description:`num_hidden_layers (int
, optional, defaults to 12) —
Number of hidden layers in the Transformer encoder.`,name:"num_hidden_layers"},{anchor:"transformers.SEWDConfig.num_attention_heads",description:`num_attention_heads (int
, optional, defaults to 12) —
Number of attention heads for each attention layer in the Transformer encoder.`,name:"num_attention_heads"},{anchor:"transformers.SEWDConfig.intermediate_size",description:`intermediate_size (int
, optional, defaults to 3072) —
Dimensionality of the “intermediate” (i.e., feed-forward) layer in the Transformer encoder.`,name:"intermediate_size"},{anchor:"transformers.SEWDConfig.squeeze_factor",description:`squeeze_factor (int
, optional, defaults to 2) —
Sequence length downsampling factor after the encoder and upsampling factor after the transformer.`,name:"squeeze_factor"},{anchor:"transformers.SEWDConfig.max_position_embeddings",description:`max_position_embeddings (int
, optional, defaults to 512) —
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).`,name:"max_position_embeddings"},{anchor:"transformers.SEWDConfig.position_buckets",description:`position_buckets (int
, optional, defaults to 256) —
The maximum size of relative position embeddings.`,name:"position_buckets"},{anchor:"transformers.SEWDConfig.share_att_key",description:`share_att_key (bool
, optional, defaults to True
) —
Whether to share attention key with c2p and p2c.`,name:"share_att_key"},{anchor:"transformers.SEWDConfig.relative_attention",description:`relative_attention (bool
, optional, defaults to True
) —
Whether to use relative position encoding.`,name:"relative_attention"},{anchor:"transformers.SEWDConfig.pos_att_type",description:`pos_att_type (tuple[str]
, optional, defaults to ("p2c", "c2p")
) —
The type of relative position attention; it can be a combination of ("p2c", "c2p")
, e.g. ("p2c")
or
("p2c", "c2p")
.`,name:"pos_att_type"},{anchor:"transformers.SEWDConfig.norm_rel_ebd",description:`norm_rel_ebd (str
, optional, defaults to "layer_norm"
) —
Whether to use layer norm in relative embedding ("layer_norm"
if yes)`,name:"norm_rel_ebd"},{anchor:"transformers.SEWDConfig.hidden_act",description:`hidden_act (str
or function
, optional, defaults to "gelu_python"
) —
The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu"
,
"relu"
, "selu"
, "gelu_python"
and "gelu_new"
are supported.`,name:"hidden_act"},{anchor:"transformers.SEWDConfig.hidden_dropout",description:`hidden_dropout (float
, optional, defaults to 0.1) —
Deprecated. Not used by the model and will be removed in a future version.`,name:"hidden_dropout"},{anchor:"transformers.SEWDConfig.activation_dropout",description:`activation_dropout (float
, optional, defaults to 0.1) —
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.`,name:"activation_dropout"},{anchor:"transformers.SEWDConfig.attention_dropout",description:`attention_dropout (float
, optional, defaults to 0.1) —
The dropout ratio for the attention probabilities.`,name:"attention_dropout"},{anchor:"transformers.SEWDConfig.final_dropout",description:`final_dropout (float
, optional, defaults to 0.1) —
The dropout probability for the final projection layer of SEWDForCTC.`,name:"final_dropout"},{anchor:"transformers.SEWDConfig.initializer_range",description:`initializer_range (float
, optional, defaults to 0.02) —
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.`,name:"initializer_range"},{anchor:"transformers.SEWDConfig.layer_norm_eps",description:`layer_norm_eps (float
, optional, defaults to 1e-7) —
The epsilon used by the layer normalization layers in the transformer encoder.`,name:"layer_norm_eps"},{anchor:"transformers.SEWDConfig.feature_layer_norm_eps",description:`feature_layer_norm_eps (float
, optional, defaults to 1e-5) —
The epsilon used by the layer normalization after the feature encoder.`,name:"feature_layer_norm_eps"},{anchor:"transformers.SEWDConfig.feat_extract_norm",description:`feat_extract_norm (str
, optional, defaults to "group"
) —
The norm to be applied to 1D convolutional layers in feature encoder. One of "group"
for group
normalization of only the first 1D convolutional layer or "layer"
for layer normalization of all 1D
convolutional layers.`,name:"feat_extract_norm"},{anchor:"transformers.SEWDConfig.feat_proj_dropout",description:`feat_proj_dropout (float
, optional, defaults to 0.0) —
The dropout probability for output of the feature encoder.`,name:"feat_proj_dropout"},{anchor:"transformers.SEWDConfig.feat_extract_activation",description:`feat_extract_activation (str
, optional, defaults to "gelu"
) —
The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, "gelu"
, "relu"
, "selu"
and "gelu_new"
are supported.`,name:"feat_extract_activation"},{anchor:"transformers.SEWDConfig.conv_dim",description:`conv_dim (tuple[int]
or list[int]
, optional, defaults to (64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)
) —
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature encoder. The length of conv_dim defines the number of 1D convolutional layers.`,name:"conv_dim"},{anchor:"transformers.SEWDConfig.conv_stride",description:`conv_stride (tuple[int]
or list[int]
, optional, defaults to (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)
) —
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of conv_stride defines the number of convolutional layers and has to match the length of conv_dim.`,name:"conv_stride"},{anchor:"transformers.SEWDConfig.conv_kernel",description:`conv_kernel (tuple[int]
or list[int]
, optional, defaults to (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)
) —
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of conv_kernel defines the number of convolutional layers and has to match the length of
conv_dim.`,name:"conv_kernel"},{anchor:"transformers.SEWDConfig.conv_bias",description:`conv_bias (bool
, optional, defaults to False
) —
Whether the 1D convolutional layers have a bias.`,name:"conv_bias"},{anchor:"transformers.SEWDConfig.num_conv_pos_embeddings",description:`num_conv_pos_embeddings (int
, optional, defaults to 128) —
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.`,name:"num_conv_pos_embeddings"},{anchor:"transformers.SEWDConfig.num_conv_pos_embedding_groups",description:`num_conv_pos_embedding_groups (int
, optional, defaults to 16) —
Number of groups of 1D convolutional positional embeddings layer.`,name:"num_conv_pos_embedding_groups"},{anchor:"transformers.SEWDConfig.apply_spec_augment",description:`apply_spec_augment (bool
, optional, defaults to True
) —
Whether to apply SpecAugment data augmentation to the outputs of the feature encoder. For reference see
SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition.`,name:"apply_spec_augment"},{anchor:"transformers.SEWDConfig.mask_time_prob",description:`mask_time_prob (float
, optional, defaults to 0.05) —
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
procedure generates ”mask_time_prob*len(time_axis)/mask_time_length” independent masks over the axis. If
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, mask_time_prob should be \`prob_vector_start*mask_time_length\`. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if
\`apply_spec_augment is True\`.`,name:"mask_time_prob"},{anchor:"transformers.SEWDConfig.mask_time_length",description:`mask_time_length (int
, optional, defaults to 10) —
Length of vector span along the time axis.`,name:"mask_time_length"},{anchor:"transformers.SEWDConfig.mask_time_min_masks",description:`mask_time_min_masks (int
, optional, defaults to 2) —
The minimum number of masks of length mask_time_length
generated along the time axis, each time step,
irrespective of mask_time_prob
. Only relevant if ”mask_time_prob*len(time_axis)/mask_time_length <
mask_time_min_masks”`,name:"mask_time_min_masks"},{anchor:"transformers.SEWDConfig.mask_feature_prob",description:`mask_feature_prob (float
, optional, defaults to 0.0) —
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
masking procedure generates ”mask_feature_prob*len(feature_axis)/mask_feature_length” independent masks over
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, mask_feature_prob should be \`prob_vector_start*mask_feature_length\`. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if
\`apply_spec_augment is
True\`.`,name:"mask_feature_prob"},{anchor:"transformers.SEWDConfig.mask_feature_length",description:`mask_feature_length (int
, optional, defaults to 10) —
Length of vector span along the feature axis.`,name:"mask_feature_length"},{anchor:"transformers.SEWDConfig.mask_feature_min_masks",description:`mask_feature_min_masks (int
, optional, defaults to 0) —
The minimum number of masks of length mask_feature_length
generated along the feature axis, each time
step, irrespective of mask_feature_prob
. Only relevant if
”mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks”`,name:"mask_feature_min_masks"},{anchor:"transformers.SEWDConfig.diversity_loss_weight",description:`diversity_loss_weight (int
, optional, defaults to 0.1) —
The weight of the codebook diversity loss component.`,name:"diversity_loss_weight"},{anchor:"transformers.SEWDConfig.ctc_loss_reduction",description:`ctc_loss_reduction (str
, optional, defaults to "sum"
) —
Specifies the reduction to apply to the output of torch.nn.CTCLoss
. Only relevant when training an
instance of SEWDForCTC.`,name:"ctc_loss_reduction"},{anchor:"transformers.SEWDConfig.ctc_zero_infinity",description:`ctc_zero_infinity (bool
, optional, defaults to False
) —
Whether to zero infinite losses and the associated gradients of torch.nn.CTCLoss
. Infinite losses mainly
occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
of SEWDForCTC.`,name:"ctc_zero_infinity"},{anchor:"transformers.SEWDConfig.use_weighted_layer_sum",description:`use_weighted_layer_sum (bool
, optional, defaults to False
) —
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of SEWDForSequenceClassification.`,name:"use_weighted_layer_sum"},{anchor:"transformers.SEWDConfig.classifier_proj_size",description:`classifier_proj_size (int
, optional, defaults to 256) —
Dimensionality of the projection before token mean-pooling for classification.`,name:"classifier_proj_size"}],source:"https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/models/sew_d/configuration_sew_d.py#L27"}}),G=new at({props:{anchor:"transformers.SEWDConfig.example",$$slots:{default:[io]},$$scope:{ctx:M}}}),ce=new Q({props:{name:"to_dict",anchor:"transformers.SEWDConfig.to_dict",parameters:[],source:"https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/models/sew_d/configuration_sew_d.py#L282"}}),de=new P({props:{title:"SEWDModel",local:"transformers.SEWDModel",headingTag:"h2"}}),me=new Q({props:{name:"class transformers.SEWDModel",anchor:"transformers.SEWDModel",parameters:[{name:"config",val:": SEWDConfig"}],parametersDescription:[{anchor:"transformers.SEWDModel.config",description:`config (SEWDConfig) —
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
from_pretrained() method to load the model weights.`,name:"config"}],source:"https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/models/sew_d/modeling_sew_d.py#L1257"}}),pe=new Q({props:{name:"forward",anchor:"transformers.SEWDModel.forward",parameters:[{name:"input_values",val:": typing.Optional[torch.Tensor]"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"mask_time_indices",val:": typing.Optional[torch.FloatTensor] = None"},{name:"output_attentions",val:": typing.Optional[bool] = None"},{name:"output_hidden_states",val:": typing.Optional[bool] = None"},{name:"return_dict",val:": typing.Optional[bool] = None"}],parametersDescription:[{anchor:"transformers.SEWDModel.forward.input_values",description:`input_values (torch.Tensor
of shape (batch_size, sequence_length)
, optional) —
Float values of input raw speech waveform. Values can be obtained by loading a .flac
or .wav
audio file
into an array of type list[float]
, a numpy.ndarray
or a torch.Tensor
, e.g. via the torchcodec library
(pip install torchcodec
) or the soundfile library (pip install soundfile
).
To prepare the array into input_values
, the AutoProcessor should be used for padding and conversion
into a tensor of type torch.FloatTensor
. See Wav2Vec2Processor.__call__() for details.`,name:"input_values"},{anchor:"transformers.SEWDModel.forward.attention_mask",description:`attention_mask (torch.Tensor
of shape (batch_size, sequence_length)
, optional) —
Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
:
What are attention masks?`,name:"attention_mask"},{anchor:"transformers.SEWDModel.forward.mask_time_indices",description:`mask_time_indices (torch.BoolTensor
of shape (batch_size, sequence_length)
, optional) —
Indices to mask extracted features for contrastive loss. When in training mode, the model learns to predict
masked extracted features in config.proj_codevector_dim space.`,name:"mask_time_indices"},{anchor:"transformers.SEWDModel.forward.output_attentions",description:`output_attentions (bool
, optional) —
Whether or not to return the attentions tensors of all attention layers. See attentions
under returned
tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.SEWDModel.forward.output_hidden_states",description:`output_hidden_states (bool
, optional) —
Whether or not to return the hidden states of all layers. See hidden_states
under returned tensors for
more detail.`,name:"output_hidden_states"},{anchor:"transformers.SEWDModel.forward.return_dict",description:`return_dict (bool
, optional) —
Whether or not to return a ModelOutput instead of a plain tuple.`,name:"return_dict"}],source:"https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/models/sew_d/modeling_sew_d.py#L1324",returnDescription:`