Run open-source models locally and learn AI application development from scratch.
```bash
# One-command install (Linux / macOS)
curl -fsSL https://ollama.com/install.sh | sh

# Windows: download the installer directly
# https://ollama.com/download
```
```bash
# Pull a model (the weights are downloaded on first pull)
ollama pull llama3.2

# Chat directly in the terminal
ollama run llama3.2

# List installed models
ollama list

# Show which models are currently loaded
ollama ps
```
Once running, Ollama automatically exposes a local HTTP API, on port 11434 by default.
```bash
# Check that the API is responding
curl http://localhost:11434/api/generate \
  -d '{
    "model": "llama3.2",
    "prompt": "Hello, introduce yourself in one sentence",
    "stream": false
  }'
```
To make the API reachable from other machines, set the `OLLAMA_HOST=0.0.0.0` environment variable. Choose a model to match your hardware; VRAM (or RAM, when running on CPU) is the main constraint.
Ollama exposes two APIs: the native REST API and an OpenAI-compatible API (recommended).
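For comparison, here is a minimal sketch of calling the native REST API from Python with the `requests` library (`pip install requests`; the `/api/chat` endpoint accepts the same role/content message format):

```python
import requests

# Native REST API: POST /api/chat (non-streaming)
resp = requests.post(
    "http://localhost:11434/api/chat",
    json={
        "model": "llama3.2",
        "messages": [{"role": "user", "content": "Say hello in one sentence"}],
        "stream": False,
    },
)
print(resp.json()["message"]["content"])
```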
The latter is fully compatible with the OpenAI SDK: point `base_url` at Ollama and existing OpenAI code migrates with no other changes.
```bash
curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama3.2",
    "messages": [
      {"role": "system", "content": "You are a Python expert"},
      {"role": "user", "content": "Write a quicksort function"}
    ]
  }'
```
```bash
curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama3.2",
    "stream": true,
    "messages": [{"role": "user", "content": "Tell me a joke"}]
  }'
```
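The same streaming works through the OpenAI SDK; a minimal sketch using the `openai` package (installed in the environment setup below):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

# stream=True yields chunks; each chunk carries a token delta
stream = client.chat.completions.create(
    model="llama3.2",
    messages=[{"role": "user", "content": "Tell me a joke"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```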
```bash
# Create a virtual environment
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate

# Install the dependencies
pip install openai ollama langchain-ollama
```
```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",  # any non-empty string works; Ollama ignores it
)

response = client.chat.completions.create(
    model="qwen2.5:7b",
    messages=[
        {"role": "system", "content": "You are a professional coding assistant"},
        {"role": "user", "content": "Implement a simple HTTP server in Python"},
    ],
)
print(response.choices[0].message.content)
```
```python
import ollama

# Streaming chat
for chunk in ollama.chat(
    model="llama3.2",
    messages=[{"role": "user", "content": "Explain what a Transformer is"}],
    stream=True,
):
    print(chunk["message"]["content"], end="", flush=True)

# Generate embeddings
result = ollama.embed(model="nomic-embed-text", input="This is a sample sentence")
vector = result["embeddings"][0]  # a 768-dimensional vector
```
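What are embeddings good for? A quick sketch comparing texts by cosine similarity, using only the standard library on top of `ollama.embed` (the example sentences are arbitrary):

```python
import math
import ollama

def embed(text):
    # One vector per input string; take the first
    return ollama.embed(model="nomic-embed-text", input=text)["embeddings"][0]

def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm

v1 = embed("How do I train a neural network?")
v2 = embed("Tips for fitting a deep learning model")
v3 = embed("Best pizza places in New York")
print(cosine(v1, v2))  # semantically related -> higher score
print(cosine(v1, v3))  # unrelated -> lower score
```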
```python
from openai import OpenAI

class ChatBot:
    def __init__(self, model="qwen2.5:7b", system=""):
        self.client = OpenAI(base_url="http://localhost:11434/v1", api_key="x")
        self.model = model
        self.history = [{"role": "system", "content": system}] if system else []

    def chat(self, user_input):
        self.history.append({"role": "user", "content": user_input})
        resp = self.client.chat.completions.create(
            model=self.model, messages=self.history
        )
        reply = resp.choices[0].message.content
        self.history.append({"role": "assistant", "content": reply})
        return reply

# Usage
bot = ChatBot(system="You are a patient programming mentor; explain concepts concisely")
print(bot.chat("What is a vector database?"))
print(bot.chat("How is it different from a regular database?"))
```
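One caveat: `history` grows without bound and will eventually overflow the model's context window. A minimal sketch of one common fix, keeping the system prompt plus only the most recent messages (`trim_history` and `max_messages` are illustrative names, not part of any API):

```python
def trim_history(history, max_messages=20):
    """Keep the system message (if any) plus the last max_messages entries."""
    system = [m for m in history if m["role"] == "system"]
    rest = [m for m in history if m["role"] != "system"]
    return system + rest[-max_messages:]

# Inside ChatBot.chat, send the trimmed view instead of the full history:
#     messages=trim_history(self.history)
```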
Retrieval-Augmented Generation (RAG) is the most practical AI application pattern: it lets the model consult your own documents.
```bash
pip install chromadb langchain-ollama langchain-community pypdf
```
```bash
# nomic-embed-text is the most widely used embedding model on Ollama
ollama pull nomic-embed-text
```
```python
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Load the document and split it into chunks
loader = TextLoader("my_docs.txt", encoding="utf-8")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(loader.load())

# 2. Embed the chunks and store them in ChromaDB
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# 3. Retrieve + generate
def ask(question):
    context = "\n\n".join(d.page_content for d in retriever.invoke(question))
    llm = ChatOllama(model="qwen2.5:7b")
    prompt = f"Answer the question based on the following material:\n\n{context}\n\nQuestion: {question}"
    return llm.invoke(prompt).content

print(ask("What core concepts does the document mention?"))
```
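The vector store above lives in memory and is rebuilt on every run. Chroma can persist to disk; a sketch reusing the `docs` and `embeddings` objects above, assuming a langchain_community version that accepts these parameters (the `./chroma_db` path is arbitrary):

```python
from langchain_community.vectorstores import Chroma

# First run: build the index and persist it to disk
vectorstore = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")

# Later runs: reload the persisted index instead of re-embedding everything
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
```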
Experiment with `chunk_size` in the 300–800 range, and keep `chunk_overlap` at roughly 10% of `chunk_size`.

Agents let the model use tools and call functions to accomplish complex, multi-step tasks.
```python
import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="x")

# Tool schema the model can choose to call
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get weather information for a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"}
            },
            "required": ["city"]
        }
    }
}]

# Tool implementation
def get_weather(city):
    return {"city": city, "temp": "22°C", "condition": "sunny"}

# One round of the agent loop
messages = [{"role": "user", "content": "What's the weather in Beijing today?"}]
resp = client.chat.completions.create(
    model="llama3.1:8b", messages=messages, tools=tools
)

msg = resp.choices[0].message
if msg.tool_calls:
    messages.append(msg)  # append the assistant turn once, before the tool results
    for call in msg.tool_calls:
        args = json.loads(call.function.arguments)
        result = get_weather(**args)
        messages.append({
            "role": "tool",
            "tool_call_id": call.id,
            "content": json.dumps(result, ensure_ascii=False),
        })
    final = client.chat.completions.create(
        model="llama3.1:8b", messages=messages
    )
    print(final.choices[0].message.content)
```
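The snippet above handles a single round of tool use. A real agent loops until the model stops requesting tools; here is a minimal sketch of that loop under the same setup, with a dispatch table and an iteration cap as a safety valve (both are illustrative choices, not Ollama requirements):

```python
TOOLS_BY_NAME = {"get_weather": get_weather}

for _ in range(5):  # iteration cap so a confused model can't loop forever
    resp = client.chat.completions.create(
        model="llama3.1:8b", messages=messages, tools=tools
    )
    msg = resp.choices[0].message
    if not msg.tool_calls:
        print(msg.content)
        break
    messages.append(msg)
    for call in msg.tool_calls:
        args = json.loads(call.function.arguments)
        result = TOOLS_BY_NAME[call.function.name](**args)
        messages.append({
            "role": "tool",
            "tool_call_id": call.id,
            "content": json.dumps(result, ensure_ascii=False),
        })
```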
Tool calling works well with llama3.1 and qwen2.5; models under 7B tend to be unreliable at it. For a ChatGPT-style web interface on top of your local models, run Open WebUI:

```bash
# --add-host lets the container reach the Ollama server running on the host
docker run -d -p 3000:8080 \
  --add-host=host.docker.internal:host-gateway \
  ghcr.io/open-webui/open-webui:main
```