Get a list of models that are currently loaded in memory, including their memory usage and expiration time.
Request
Endpoint
GET /api/ps
Request Parameters
No parameters required.
Response
Response Fields
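- `models`: Array of currently loaded model objects
  - `name`: Model name
  - `model`: Full model identifier (same as `name`)
  - `size`: Total size of the model in bytes
  - `size_vram`: Size of model loaded in GPU memory (VRAM) in bytes
  - `digest`: SHA256 digest of the model
  - `details`: Model architecture details
    - `format`: Model format (e.g., "gguf")
    - `family`: Model architecture family
    - `families`: Array of architecture families
    - `parameter_size`: Human-readable parameter count (e.g., "3B")
    - `quantization_level`: Quantization level (e.g., "Q4_K_M")
  - `expires_at`: Timestamp when the model will be unloaded from memory (ISO 8601)
  - `context_length`: Current context window size in tokens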
Examples
curl http://localhost:11434/api/ps
Example Response
{
"models" : [
{
"name" : "llama3.2" ,
"model" : "llama3.2" ,
"size" : 3826793677 ,
"size_vram" : 3640655872 ,
"digest" : "sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8" ,
"details" : {
"format" : "gguf" ,
"family" : "llama" ,
"families" : [ "llama" ],
"parameter_size" : "3B" ,
"quantization_level" : "Q4_K_M"
},
"expires_at" : "2024-02-24T12:39:56.789Z" ,
"context_length" : 4096
},
{
"name" : "mistral:7b-instruct" ,
"model" : "mistral:7b-instruct" ,
"size" : 4109865159 ,
"size_vram" : 3942645760 ,
"digest" : "sha256:2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8" ,
"details" : {
"format" : "gguf" ,
"family" : "mistral" ,
"families" : [ "mistral" ],
"parameter_size" : "7B" ,
"quantization_level" : "Q4_0"
},
"expires_at" : "2024-02-24T12:41:23.456Z" ,
"context_length" : 8192
}
]
}
Monitor Memory Usage
import requests
import time
def monitor_memory(url: str = 'http://localhost:11434/api/ps',
                   interval: float = 2) -> None:
    """Continuously print how many models are loaded and their total VRAM use.

    Polls the running-models endpoint forever, rewriting a single status
    line in place. Press Ctrl-C to stop.

    Args:
        url: Address of the ``/api/ps`` endpoint to poll.
        interval: Seconds to wait between polls.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    bytes_per_gib = 1024 ** 3

    try:
        while True:
            # A timeout keeps the monitor from hanging forever on a stalled
            # server; raise_for_status surfaces HTTP errors (e.g. 500)
            # instead of failing later on an unexpected body.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            models = response.json().get('models') or []

            total_vram = sum(m.get('size_vram', 0) for m in models)
            total_vram_gb = total_vram / bytes_per_gib

            # '\r' returns to the start of the line so each update overwrites
            # the previous one. flush=True is required because no newline is
            # written, so a line-buffered stdout would otherwise show nothing.
            print(
                f"\rLoaded models: {len(models)} | "
                f"Total VRAM: {total_vram_gb:.2f} GB",
                end='',
                flush=True,
            )
            time.sleep(interval)
    except KeyboardInterrupt:
        # Finish the in-place status line so the shell prompt starts cleanly.
        print()


if __name__ == "__main__":
    monitor_memory()
Check if Model is Loaded
/**
 * Normalize a model name so that an omitted tag compares equal to the
 * implicit default tag ("llama3.2" -> "llama3.2:latest").
 *
 * Only the last path segment is inspected for a tag, so a registry host
 * with a port ("host:5000/ns/model") is not mistaken for a tagged name.
 *
 * @param {unknown} name - Model name as given by the caller or the API.
 * @returns {string|null} Name with an explicit tag, or null if not a
 *   non-empty string.
 */
function normalizeModelName(name) {
  if (typeof name !== 'string' || name === '') {
    return null;
  }
  const lastSegment = name.slice(name.lastIndexOf('/') + 1);
  return lastSegment.includes(':') ? name : `${name}:latest`;
}

/**
 * Check whether a model is currently loaded in memory.
 *
 * @param {string} modelName - Model to look for, with or without a tag.
 * @returns {Promise<boolean>} True if the model appears in the list of
 *   running models.
 * @throws {Error} If the server responds with a non-2xx status.
 */
async function isModelLoaded(modelName) {
  const wanted = normalizeModelName(modelName);
  if (wanted === null) {
    return false;
  }

  const response = await fetch('http://localhost:11434/api/ps');
  if (!response.ok) {
    throw new Error(`Failed to list running models: HTTP ${response.status}`);
  }

  const data = await response.json();
  // Tolerate a missing "models" array and match on either identifier field.
  return (data.models ?? []).some(
    (m) =>
      normalizeModelName(m.name) === wanted ||
      normalizeModelName(m.model) === wanted
  );
}

if (await isModelLoaded('llama3.2')) {
  console.log('Model is ready in memory');
} else {
  console.log('Model needs to be loaded');
}
Memory Usage Report
import requests
def memory_report(models=None, *, url: str = 'http://localhost:11434/api/ps',
                  timeout: float = 10) -> None:
    """Print a per-model and aggregate memory report for loaded models.

    Args:
        models: Optional list of model objects, shaped like the entries of
            the ``models`` array returned by ``GET /api/ps``. When omitted,
            the list is fetched from ``url``.
        url: Address of the ``/api/ps`` endpoint (used only when ``models``
            is not supplied).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    bytes_per_gib = 1024 ** 3

    if models is None:
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors (e.g. 500) instead of a confusing KeyError
        # on the error body.
        response.raise_for_status()
        models = response.json().get('models') or []

    if not models:
        print("No models currently loaded")
        return

    print("=== Loaded Models Memory Report ===")
    print()

    total_size = 0
    total_vram = 0

    for model in models:
        # Fields are read defensively: a missing optional field should not
        # abort the whole report.
        size = model.get('size', 0)
        vram = model.get('size_vram', 0)
        details = model.get('details') or {}

        total_size += size
        total_vram += vram

        # Guard against division by zero for a model reporting size 0.
        vram_percent = (vram / size * 100) if size > 0 else 0

        print(f"Model: {model.get('name', 'unknown')}")
        print(f"  Parameters: {details.get('parameter_size', 'unknown')}")
        print(f"  Total size: {size / bytes_per_gib:.2f} GB")
        print(f"  VRAM usage: {vram / bytes_per_gib:.2f} GB ({vram_percent:.1f}%)")
        print(f"  Context: {model.get('context_length', 'unknown')} tokens")
        print()

    print(f"Total: {len(models)} models")
    print(f"Total size: {total_size / bytes_per_gib:.2f} GB")
    print(f"Total VRAM: {total_vram / bytes_per_gib:.2f} GB")


if __name__ == "__main__":
    memory_report()
Error Responses
Common Errors
500 Internal Server Error: Error accessing scheduler state
Models are sorted by expiration time, with models that will expire soonest appearing first.
The expires_at time updates whenever the model is used. Models are automatically unloaded from memory after their expiration time.
`size` represents the total model size, while `size_vram` shows how much of it is actually loaded in GPU memory. The difference (`size` minus `size_vram`) is typically loaded in system RAM or offloaded.