Skip to main content
Get a list of models that are currently loaded in memory, including their memory usage and expiration time.

Request

Endpoint

GET /api/ps

Request Parameters

No parameters required.

Response

Response Fields

models
array
Array of currently loaded model objects

Examples

curl http://localhost:11434/api/ps

Example Response

{
  "models": [
    {
      "name": "llama3.2",
      "model": "llama3.2",
      "size": 3826793677,
      "size_vram": 3640655872,
      "digest": "sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8",
      "details": {
        "format": "gguf",
        "family": "llama",
        "families": ["llama"],
        "parameter_size": "3B",
        "quantization_level": "Q4_K_M"
      },
      "expires_at": "2024-02-24T12:39:56.789Z",
      "context_length": 4096
    },
    {
      "name": "mistral:7b-instruct",
      "model": "mistral:7b-instruct",
      "size": 4109865159,
      "size_vram": 3942645760,
      "digest": "sha256:2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8",
      "details": {
        "format": "gguf",
        "family": "mistral",
        "families": ["mistral"],
        "parameter_size": "7B",
        "quantization_level": "Q4_0"
      },
      "expires_at": "2024-02-24T12:41:23.456Z",
      "context_length": 8192
    }
  ]
}

Monitor Memory Usage

import requests
import time

def monitor_memory(url='http://localhost:11434/api/ps', interval=2, timeout=10):
    """Continuously print the number of loaded models and their combined VRAM use.

    Polls the ``/api/ps`` endpoint every ``interval`` seconds and rewrites a
    single status line in place. The loop never returns on its own; interrupt
    the process (Ctrl+C) to stop it.

    Args:
        url: Full URL of the ``/api/ps`` endpoint.
        interval: Seconds to sleep between polls.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example 500 when the scheduler state cannot be accessed).
        requests.RequestException: On connection failures or timeouts.
    """
    while True:
        # Without a timeout a stalled server would block this loop forever.
        response = requests.get(url, timeout=timeout)
        # Error replies are documented as carrying an "error" string; surface
        # the HTTP status instead of failing on the 'models' lookup below.
        response.raise_for_status()
        models = response.json()['models']

        total_vram = sum(m['size_vram'] for m in models)
        total_vram_gb = total_vram / (1024 ** 3)

        # "\r" moves back to the start of the line so each update overwrites
        # the previous one. No newline is written, so flush explicitly or a
        # line-buffered stdout may never show the status.
        print(f"\rLoaded models: {len(models)} | "
              f"Total VRAM: {total_vram_gb:.2f} GB", end='', flush=True)

        time.sleep(interval)

# Start monitoring; this call blocks in an endless polling loop, so stop it with Ctrl+C.
monitor_memory()

Check if Model is Loaded

/**
 * Check whether a model is currently loaded in memory.
 *
 * @param {string} modelName - Name to look for; compared exactly against the
 *   `name` field of each entry returned by GET /api/ps.
 * @returns {Promise<boolean>} Resolves to true if a loaded model has that name.
 * @throws {Error} If the server responds with a non-2xx status.
 */
async function isModelLoaded(modelName) {
  const response = await fetch('http://localhost:11434/api/ps');
  // An error reply has no `models` array; report the HTTP status instead of
  // letting `.some` fail on undefined.
  if (!response.ok) {
    throw new Error(`GET /api/ps failed with status ${response.status}`);
  }
  const data = await response.json();

  // Treat a missing or null list the same as "nothing loaded".
  return (data.models ?? []).some(m => m.name === modelName);
}

// Look up the model once, then report whether it is already resident in memory.
const loaded = await isModelLoaded('llama3.2');
console.log(loaded ? 'Model is ready in memory' : 'Model needs to be loaded');

Memory Usage Report

import requests

def memory_report(url='http://localhost:11434/api/ps', timeout=10):
    """Print a per-model and total memory report for the loaded models.

    Fetches the loaded-model list once from ``/api/ps`` and prints, for each
    model, its parameter size, total size, VRAM usage (absolute and as a
    percentage of total size) and context length, followed by overall totals.
    Prints a short notice and returns if no models are loaded.

    Args:
        url: Full URL of the ``/api/ps`` endpoint.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example 500 when the scheduler state cannot be accessed).
        requests.RequestException: On connection failures or timeouts.
    """
    gib = 1024 ** 3  # sizes are reported in bytes; display them in GiB

    response = requests.get(url, timeout=timeout)
    # Error replies are documented as carrying an "error" string; surface the
    # HTTP status instead of failing on the 'models' lookup below.
    response.raise_for_status()
    models = response.json()['models']

    if not models:
        print("No models currently loaded")
        return

    print("=== Loaded Models Memory Report ===")
    print()

    for model in models:
        size = model['size']
        size_vram = model['size_vram']
        # Guard the ratio so a zero-sized entry cannot divide by zero.
        vram_percent = (size_vram / size * 100) if size > 0 else 0

        print(f"Model: {model['name']}")
        print(f"  Parameters: {model['details']['parameter_size']}")
        print(f"  Total size: {size / gib:.2f} GB")
        print(f"  VRAM usage: {size_vram / gib:.2f} GB ({vram_percent:.1f}%)")
        print(f"  Context: {model['context_length']} tokens")
        print()

    total_size = sum(m['size'] for m in models)
    total_vram = sum(m['size_vram'] for m in models)

    print(f"Total: {len(models)} models")
    print(f"Total size: {total_size / gib:.2f} GB")
    print(f"Total VRAM: {total_vram / gib:.2f} GB")

# Fetch the loaded-model list once and print the report.
memory_report()

Error Responses

error
string
Description of the error

Common Errors

  • 500 Internal Server Error: Error accessing scheduler state
Models are sorted by expiration time, with models that will expire soonest appearing first.
The expires_at time updates whenever the model is used. Models are automatically unloaded from memory after their expiration time.
size is the total size of the loaded model, while size_vram is the portion of it that is actually resident in GPU memory. The difference (size minus size_vram) is the part not held on the GPU, which is typically offloaded to system RAM.