<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<!--
  RSS 2.0 feed for the "pretraining" tag page of Lil'Log.
  The <generator> element names Hugo as the producer, so lasting changes presumably belong
  in the site's RSS template rather than in this output (confirm before hand-editing).
  NOTE(review): the "content" namespace prefix is declared on the root, but no content:*
  element appears in this document.
-->
<rss version="2.0"
     xmlns:atom="http://www.w3.org/2005/Atom"
     xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <!-- Channel-level metadata describing the feed as a whole. -->
    <title>Pretraining on Lil'Log</title>
    <link>https://lilianweng.github.io/tags/pretraining/</link>
    <description>Recent content in Pretraining on Lil'Log</description>
    <generator>Hugo -- gohugo.io</generator>
    <language>en-us</language>
    <lastBuildDate>Wed, 24 Jun 2026 00:00:00 +0000</lastBuildDate>
    <!-- rel="self": points back at the URL of this feed document. -->
    <atom:link href="https://lilianweng.github.io/tags/pretraining/index.xml"
               rel="self"
               type="application/rss+xml"/>

    <!-- One <item> per post carrying this tag. -->
    <item>
      <title>Scaling Laws, Carefully</title>
      <link>https://lilianweng.github.io/posts/2026-06-24-scaling-laws/</link>
      <pubDate>Wed, 24 Jun 2026 00:00:00 +0000</pubDate>
      <!-- The guid repeats the post URL and carries no isPermaLink attribute. -->
      <guid>https://lilianweng.github.io/posts/2026-06-24-scaling-laws/</guid>
      <!--
        Entity-escaped HTML summary. Kept on a single line on purpose: whitespace inside
        this element is part of the text. The $...$ spans look like inline math markup
        passed through as literal text (TODO confirm that consumers expect this).
      -->
      <description>&lt;p&gt;Scaling laws are one of the most critical empirical findings in deep learning. The observation is simple in form: the training loss $L$ decreases predictably as we scale up model size $N$, dataset size $D$, and compute $C$, following a power-law curve, which appears as a straight line on a log-log plot. We can view scaling laws as a framework for describing the relationship between compute, loss, model size and data; at its core, it is about how to allocate precious compute optimally between $N$ and $D$.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
