# Using Local SGD with Accelerate

Local SGD is a technique for distributed training where gradients are not synchronized every step. Each process instead updates its own version of the model weights, and after a given number of steps these weights are synchronized by averaging across all processes. This improves communication efficiency and can lead to a substantial training speed-up, especially when a machine lacks a fast interconnect such as NVLink. Unlike gradient accumulation (where improving communication efficiency requires increasing the effective batch size), Local SGD does not require changing the batch size or the learning rate / schedule. However, if necessary, Local SGD can be combined with gradient accumulation as well.
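To make the idea concrete, here is a minimal sketch of the core algorithm in plain `torch.distributed` terms. It assumes a process group is already initialized (e.g., via `torchrun`) and that each process holds its own unwrapped copy of `model`; it illustrates the technique only and is not Accelerate's actual implementation:

```python
import torch.distributed as dist

local_sgd_steps = 8  # average weights across processes every 8 optimizer steps

for step, batch in enumerate(training_dataloader):
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss.backward()        # gradients stay local; no per-step all-reduce
    optimizer.step()
    optimizer.zero_grad()

    # Synchronization point: replace each process's weights with the
    # average of the weights across all processes.
    if (step + 1) % local_sgd_steps == 0:
        world_size = dist.get_world_size()
        for param in model.parameters():
            dist.all_reduce(param.data, op=dist.ReduceOp.SUM)
            param.data /= world_size
```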
In this tutorial you will see how to quickly set up Local SGD with Accelerate. Compared to a standard Accelerate setup, this requires only two extra lines of code.

This example will use a very simplistic PyTorch training loop that performs gradient accumulation every two batches:

```python
device = "cuda"
model.to(device)

gradient_accumulation_steps = 2

for index, batch in enumerate(training_dataloader):
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (index + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

## Converting it to Accelerate

First, the code shown earlier will be converted to use Accelerate with neither a LocalSGD nor a gradient accumulation helper:

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for index, batch in enumerate(training_dataloader):
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
      loss = loss / gradient_accumulation_steps
+     accelerator.backward(loss)
      if (index + 1) % gradient_accumulation_steps == 0:
          optimizer.step()
          scheduler.step()
```

## Letting Accelerate handle model synchronization

All that is left now is to let Accelerate handle the model parameter synchronization and the gradient accumulation for us. For simplicity, let us assume we need to synchronize every 8 steps. This is achieved by adding one `with LocalSGD` statement and one call to `local_sgd.step()` after every optimizer step:

```diff
+local_sgd_steps = 8

+with LocalSGD(accelerator=accelerator, model=model, local_sgd_steps=8, enabled=True) as local_sgd:
    for batch in training_dataloader:
        with accelerator.accumulate(model):
            inputs, targets = batch
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
+           local_sgd.step()
```

Under the hood, the Local SGD code disables automatic gradient synchronization (but accumulation still works as expected!). Instead, it averages model parameters every `local_sgd_steps` steps (as well as at the end of the training loop).
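The first half of that sentence can be pictured with plain PyTorch primitives. With a `DistributedDataParallel`-wrapped model, every `backward()` normally triggers a gradient all-reduce, and skipping it is exactly what the `no_sync()` context manager provides. Below is a hedged sketch of that piece, assuming `ddp_model` is a `torch.nn.parallel.DistributedDataParallel` instance; it is not the LocalSGD source code:

```python
# ddp_model: a torch.nn.parallel.DistributedDataParallel wrapper (assumed).
# Inside no_sync(), backward() skips DDP's gradient all-reduce, so each
# process keeps its own local gradients until the next synchronization point.
with ddp_model.no_sync():
    outputs = ddp_model(inputs)
    loss = loss_function(outputs, targets)
    loss.backward()  # no inter-process communication happens here
```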
## Limitations

The current implementation works only with basic multi-GPU (or multi-CPU) training, without, e.g., DeepSpeed.

## References

Although we are not aware of the true origins of this simple approach, the idea of Local SGD is quite old and goes back to at least:

Zhang, J., De Sa, C., Mitliagkas, I., & Ré, C. (2016). Parallel SGD: When does averaging help? arXiv preprint arXiv:1606.07365.

We credit the term Local SGD to the following paper (but there might be earlier references we are not aware of):

Stich, Sebastian Urban. (2019). Local SGD Converges Fast and Communicates Little. ICLR 2019, International Conference on Learning Representations.