Running LLMs on AOS (Highly Experimental)

You can run LLMs on top of AOS by using an AOS module with Llama support built in. WAO bundles such a module (llama.wasm), which the test below loads from node_modules/wao/esm/lua/llama.wasm.

First create a test project.

npx wao create llm && cd llm

Create a directory and download one of the tiny models from Hugging Face.

We will try TinyLlama-1.1B-Chat-v1.0-GGUF for this tutorial.

mkdir test/models
curl -L -o test/models/tinyllama.gguf "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q2_K.gguf?download=true"
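
The Q2_K build of TinyLlama is roughly half a gigabyte, so the download can take a while. If you want a quick sanity check that the file arrived intact before running the tests, a throwaway script like this works (hypothetical helper, assuming the path above):

// check-model.mjs -- hypothetical helper, not part of the wao template
import { statSync } from "fs"

const { size } = statSync("test/models/tinyllama.gguf")
// print the size so an interrupted download is easy to spot
console.log(`tinyllama.gguf is ${(size / 1024 / 1024).toFixed(1)} MB`)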

Write a test with WAO. It posts the model to Arweave, deploys an AOS process from the llama module with the WeaveDrive extension, attests the uploaded model so the process can read it through WeaveDrive, loads the model, and asks it a question.

import assert from "assert"
import { resolve } from "path"
import { readFileSync } from "fs"
import { describe, it } from "node:test"
import { AO, acc } from "wao/test"
const __dirname = import.meta.dirname
const src_data = `
Llama = require(".Llama")
Llama.logLevel = 4
 
Handlers.add("Load", "Load", function (msg)
  Llama.load("/data/" .. msg.ModelID)
  msg.reply({ Data = "true" })
end)
 
Handlers.add("Ask", "Ask", function (msg)
  Llama.setPrompt(msg.Q)
  msg.reply({ Data = Llama.run(50) })
end)`
 
describe("LLM", function () {
  it("should infer with Tinyllama", async () => {
    const ao = await new AO().init(acc[0])
    const model = readFileSync(resolve(__dirname, "models/tinyllama.gguf"))
    const { id } = await ao.ar.post({ data: model })
    const data = readFileSync(
      resolve(__dirname, "../node_modules/wao/esm/lua/llama.wasm"),
    )
    const { id: modid } = await ao.postModule({
      data,
      tags: { "Memory-Limit": "1-gb" },
    })
    const { p, pid, err } = await ao.deploy({
      tags: { Extension: "WeaveDrive", Attestor: ao.ar.addr },
      module: modid,
      src_data,
    })
    await ao.attest({ id })
    await p.m("Load", { ModelID: id })
    console.log(await p.d("Ask", { Q: "How are you?" }, false))
  })
})
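
The test above only logs the completion. To make the test fail when inference returns nothing, you can assert on the dryrun result instead of just printing it; a minimal tweak to the last line of the test (using the assert import that is already there):

    const out = await p.d("Ask", { Q: "How are you?" }, false)
    console.log(out)
    // fail the test if the dryrun returned no completion
    assert.ok(out)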

phi-2-GGUF produces much better chat responses, but it is on the heavy side for CPU inference, so expect it to be slow. Download it and add another test case to the same describe block.

curl -L -o test/models/phi2.gguf "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf?download=true"

it("should infer with Phi2", async () => {
  const ao = await new AO().init(acc[0])
  const model = readFileSync(resolve(__dirname, "models/phi2.gguf"))
  const { id } = await ao.ar.post({ data: model })
  const data = readFileSync(
    resolve(__dirname, "../node_modules/wao/esm/lua/llama.wasm"),
  )
  const { id: modid } = await ao.postModule({
    data,
    tags: { "Memory-Limit": "2-gb" }, // the model size is more than 1GB
  })
  const { p, pid, err } = await ao.deploy({
    tags: { Extension: "WeaveDrive", Attestor: ao.ar.addr },
    module: modid,
    src_data,
  })
  await ao.attest({ id })
  await p.m("Load", { ModelID: id })
  console.log(await p.d("Ask", { Q: "How are you?" }, false))
})
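
The Phi-2 test repeats the TinyLlama setup with only the model path and the Memory-Limit tag changed. If you plan to try more models, you could factor that setup into a helper like the sketch below (the name loadModel is just an example; it reuses the same imports and src_data defined above):

const loadModel = async (file, memoryLimit) => {
  const ao = await new AO().init(acc[0])
  const model = readFileSync(resolve(__dirname, file))
  const { id } = await ao.ar.post({ data: model })
  const wasm = readFileSync(
    resolve(__dirname, "../node_modules/wao/esm/lua/llama.wasm"),
  )
  const { id: modid } = await ao.postModule({
    data: wasm,
    tags: { "Memory-Limit": memoryLimit },
  })
  const { p } = await ao.deploy({
    tags: { Extension: "WeaveDrive", Attestor: ao.ar.addr },
    module: modid,
    src_data,
  })
  await ao.attest({ id })
  await p.m("Load", { ModelID: id })
  return p
}

// usage inside a test case:
// const p = await loadModel("models/phi2.gguf", "2-gb")
// console.log(await p.d("Ask", { Q: "How are you?" }, false))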