关于多模态浏览器的思考

<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<!-- NOTE(review): removed user-scalable=no — blocking pinch-zoom fails WCAG 1.4.4 -->
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>多模态智能浏览器 · 手势语音交互演示</title>
<!-- TensorFlow.js + HandPose model (lightweight hand-gesture recognition) -->
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@3.18.0/dist/tf.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/handpose@0.0.7/dist/handpose.min.js"></script>
<style>
/* Reset: border-box sizing; selection disabled so drag gestures don't
   highlight text (note: this also prevents copying page text). */
* {
  box-sizing: border-box;
  user-select: none;
}
body {
  margin: 0;
  min-height: 100vh;
  background: linear-gradient(145deg, #0a0f1e 0%, #0c1222 100%);
  font-family: 'Inter', 'Segoe UI', system-ui, -apple-system, sans-serif;
  display: flex;
  justify-content: center;
  align-items: center;
  padding: 20px;
}
/* Main panel — futuristic glassmorphism look */
.demo-container {
  max-width: 1400px;
  width: 100%;
  background: rgba(15, 25, 45, 0.65);
  backdrop-filter: blur(12px);
  border-radius: 48px;
  border: 1px solid rgba(72, 187, 255, 0.25);
  box-shadow: 0 25px 45px rgba(0,0,0,0.4), 0 0 0 1px rgba(0, 255, 255, 0.1) inset;
  padding: 24px;
  transition: all 0.3s ease;
}
h1 {
  margin: 0 0 8px 0;
  font-size: 1.9rem;
  font-weight: 600;
  background: linear-gradient(135deg, #A0E9FF, #6C63FF, #FF6B9D);
  -webkit-background-clip: text;
  background-clip: text;
  color: transparent;
  letter-spacing: -0.3px;
}
.sub {
  color: #8e9bb5;
  margin-bottom: 24px;
  border-left: 3px solid #3b82f6;
  padding-left: 16px;
  font-weight: 400;
  font-size: 0.9rem;
}
/* Two-column layout */
.sensor-grid {
  display: flex;
  flex-wrap: wrap;
  gap: 24px;
}
.camera-card {
  flex: 2;
  min-width: 280px;
  background: rgba(0, 0, 0, 0.45);
  border-radius: 32px;
  backdrop-filter: blur(4px);
  padding: 16px;
  border: 1px solid rgba(59,130,246,0.3);
}
.voice-card {
  flex: 1.2;
  min-width: 260px;
  background: rgba(0, 0, 0, 0.45);
  border-radius: 32px;
  padding: 16px;
  border: 1px solid rgba(168,85,247,0.3);
}
.video-wrapper {
  position: relative;
  background: #000;
  border-radius: 24px;
  overflow: hidden;
  aspect-ratio: 4 / 3;
  margin-bottom: 12px;
  box-shadow: 0 8px 20px rgba(0,0,0,0.5);
}
video, canvas {
  position: absolute;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
  object-fit: cover;
  border-radius: 20px;
}
canvas {
  pointer-events: none;
  z-index: 2;
}
video {
  z-index: 1;
  transform: scaleX(-1); /* mirror view — feels more natural */
}
.gesture-status {
  background: #0f172ad9;
  border-radius: 60px;
  padding: 8px 16px;
  margin: 12px 0 8px;
  display: flex;
  align-items: center;
  justify-content: space-between;
  border: 1px solid #38bdf8;
}
.gesture-label {
  font-weight: 600;
  background: #1e293b;
  padding: 6px 14px;
  border-radius: 40px;
  font-size: 0.85rem;
  color: #b9ffec;
}
.command-badge {
  background: #2dd4bf20;
  padding: 5px 12px;
  border-radius: 24px;
  font-family: monospace;
  font-size: 0.8rem;
  color: #7dd3fc;
}
.btn-group {
  display: flex;
  gap: 12px;
  margin-top: 16px;
  flex-wrap: wrap;
}
.btn {
  background: #1e2a47;
  padding: 8px 18px;
  border-radius: 60px;
  color: white;
  font-weight: 500;
  backdrop-filter: blur(8px);
  cursor: pointer;
  transition: all 0.2s ease;
  font-size: 0.8rem;
  display: inline-flex;
  align-items: center;
  gap: 6px;
  /* single border declaration (original declared border twice) */
  border: 1px solid rgba(255,255,255,0.1);
}
.btn-primary {
  background: #3b82f6;
  box-shadow: 0 2px 8px #3b82f680;
}
.btn-primary:hover {
  background: #2563eb;
  transform: scale(1.02);
}
.voice-log {
  background: #010409aa;
  border-radius: 24px;
  padding: 14px;
  margin-top: 18px;
  max-height: 200px;
  overflow-y: auto;
  font-family: monospace;
  font-size: 0.8rem;
}
.log-entry {
  border-bottom: 1px solid #2d3a5e;
  padding: 6px 0;
  color: #cbd5e6;
}
.command-list {
  margin-top: 12px;
  background: #00000030;
  border-radius: 20px;
  padding: 10px;
  font-size: 0.7rem;
}
.footer-note {
  margin-top: 24px;
  text-align: center;
  font-size: 0.7rem;
  color: #5f6c8c;
}
@keyframes pulse {
  0% { opacity: 0.6; }
  100% { opacity: 1; text-shadow: 0 0 3px cyan; }
}
.listening {
  color: #22d3ee;
  animation: pulse 0.8s infinite;
}
hr {
  border-color: #2d3748;
  margin: 12px 0;
}
</style>
</head>
<body>
<div class="demo-container">
  <h1>🎙️✋ 多模态神经浏览器 · 概念演示</h1>
  <div class="sub">「摄像头手势 + 语音指令」自然交互 | 新一代智能浏览范型</div>

  <div class="sensor-grid">
    <!-- Left: camera + gesture recognition panel -->
    <div class="camera-card">
      <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
        <span style="font-weight: 500;">🖐️ 手势追踪 &amp; 命令映射</span>
        <span class="command-badge" id="handModelStatus">⚡ 加载模型中…</span>
      </div>
      <div class="video-wrapper">
        <video id="webcam" autoplay muted playsinline></video>
        <canvas id="handCanvas" width="640" height="480"></canvas>
      </div>
      <div class="gesture-status">
        <span>🤟 当前手势识别</span>
        <span class="gesture-label" id="gestureResult">——</span>
        <span>👉 触发动作: <span id="lastAction">无</span></span>
      </div>
      <div class="btn-group">
        <button class="btn" type="button" id="enableCamBtn">📷 开启摄像头</button>
        <button class="btn" type="button" id="resetHandBtn">🔄 重载手势模型</button>
      </div>
      <div class="command-list">
        🎯 <strong>手势指令集</strong> (演示版) <br>
        👍 <strong>大拇指竖起</strong> → 打开百度 · ✌️ <strong>V字手势</strong> → 打开Google <br>
        ✊ <strong>握拳</strong> → 页面滚动到底部 · 🖐️ <strong>手掌张开</strong> → 刷新当前页 <br>
        👆 <strong>食指指向 + 稳定</strong> → 返回上一页 (体验未来感)
      </div>
    </div>

    <!-- Right: voice control panel -->
    <div class="voice-card">
      <div style="display: flex; justify-content: space-between; align-items: center;">
        <span style="font-weight: 500;">🎙️ 神经语音控制</span>
        <button id="voiceBtn" type="button" class="btn btn-primary" style="padding: 6px 18px;">🎤 开始聆听</button>
      </div>
      <div style="margin-top: 12px;">
        <div style="background: #00000055; border-radius: 40px; padding: 8px 12px;">
          <span>✨ 指令状态: </span><strong id="voiceStatus">待命</strong>
        </div>
        <div style="margin-top: 14px;">
          <span>📝 识别文本: </span>
          <div id="speechText" style="background:#0F172A; border-radius: 20px; padding: 12px; margin-top: 6px; min-height: 55px; color:#b9e6ff;">—</div>
        </div>
      </div>
      <div class="voice-log">
        <div>📋 交互日志 (手势/语音)</div>
        <div id="logContainer">
          <div class="log-entry">✨ 系统就绪,请允许摄像头和麦克风权限</div>
        </div>
      </div>
      <div class="command-list" style="margin-top: 12px;">
        🗣️ <strong>语音命令示例</strong> (中英文混合)<br>
        • “打开百度” / “搜索天气” (演示跳转百度并搜索关键词) <br>
        • “刷新页面” / “滚动到底部” / “返回上一页”<br>
        • “打开谷歌” / “打开GitHub”<br>
        • “你好浏览器” (问候反馈)
      </div>
    </div>
  </div>
  <div class="footer-note">
    ⚡ 新一代多模态探索 | 手势基于 HandPose 模型 (实时) | 语音使用 Web Speech API | 演示镜像摄像头 | 交互式跳转/滚动/刷新
  </div>
</div>
<script>
// ---------- DOM element bindings ----------
const video = document.getElementById('webcam');
const canvas = document.getElementById('handCanvas');
const ctx = canvas.getContext('2d');
const gestureSpan = document.getElementById('gestureResult');
const lastActionSpan = document.getElementById('lastAction');
const logContainer = document.getElementById('logContainer');
const handModelStatusSpan = document.getElementById('handModelStatus');
const voiceStatusSpan = document.getElementById('voiceStatus');
const speechTextDiv = document.getElementById('speechText');
const voiceBtn = document.getElementById('voiceBtn');
const enableCamBtn = document.getElementById('enableCamBtn');
const resetHandBtn = document.getElementById('resetHandBtn');

// ---------- Global state ----------
let handModel = null;          // handpose model instance (null until loaded)
let animationId = null;        // reserved for cancelling the detection loop
let cameraActive = false;      // set true once getUserMedia succeeds
let lastExecTime = 0;          // timestamp of last executed action (debounce)
const GESTURE_COOLDOWN = 1200; // ms before the same gesture may re-trigger

// Helper: prepend a timestamped entry to the interaction log.
// When type === 'action', also mirror the message into the "last action"
// badge (the text after the '→' arrow when one is present).
function addLog(message, type = "info") {
  const div = document.createElement('div');
  div.className = 'log-entry';
  const time = new Date().toLocaleTimeString();
  div.innerHTML = `[${time}] ${message}`;
  logContainer.prepend(div);
  // Keep the log bounded so the DOM does not grow without limit.
  if (logContainer.children.length > 18) logContainer.removeChild(logContainer.lastChild);
  if (type === 'action') {
    lastActionSpan.innerText = message.split('→')[1] || message;
  }
}

// Execute a browser action — the central command dispatcher.
// actionName: symbolic command id (e.g. "open_baidu", "search_…");
// extra: optional payload such as a search keyword.
async function executeAction(actionName, extra = null) {
  const now = Date.now();
  if (now - lastExecTime < 500) return; // throttle rapid-fire triggers
  lastExecTime = now;
  addLog(`🎬 执行动作: ${actionName} ${extra ? '(' + extra + ')' : ''}`, 'action');
  switch (actionName) {
    case "open_baidu":
      window.open('https://www.baidu.com', '_blank');
      addLog(`🌐 已在新标签页打开百度`, 'action');
      break;
    case "open_google":
      window.open('https://www.google.com', '_blank');
      addLog(`🔍 打开 Google`, 'action');
      break;
    case "open_github":
      window.open('https://github.com', '_blank');
      addLog(`🐙 打开 GitHub`, 'action');
      break;
    case "refresh":
      location.reload();
      break;
    case "scroll_bottom":
      window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
      addLog(`📜 滚动至页面底部`, 'action');
      break;
    case "back":
      history.back();
      addLog(`◀️ 执行返回上一页`, 'action');
      break;
    case "search_weather": {
      const query = extra || "今日天气";
      window.open(`https://www.baidu.com/s?wd=${encodeURIComponent(query)}`, '_blank');
      addLog(`☁️ 搜索: ${query}`, 'action');
      break;
    }
    case "hello_browser":
      speak("你好,新一代智能浏览器正在为您服务");
      addLog(`👋 语音问候反馈`, 'action');
      break;
    default:
      // Any "search_*" action falls through to a generic Baidu search.
      if (actionName.startsWith("search_")) {
        const kw = extra || "AI浏览器";
        window.open(`https://www.baidu.com/s?wd=${encodeURIComponent(kw)}`, '_blank');
        addLog(`🔎 智能搜索: ${kw}`, 'action');
      }
  }
}

// Speech-synthesis feedback: cancel any queued utterance and
// speak `text` in Mandarin at a slightly slowed rate.
function speak(text) {
  if (!window.speechSynthesis) return; // silently skip when unsupported
  const utterance = new SpeechSynthesisUtterance(text);
  utterance.lang = 'zh-CN';
  utterance.rate = 0.95;
  window.speechSynthesis.cancel();
  window.speechSynthesis.speak(utterance);
}

// ---------- Core gesture logic: rule-based classification of five common
// gestures from the 21 handpose landmarks ([x, y, z] pixel coordinates;
// y grows downward in image space, so "tip above joint" means smaller y).
// Returns one of: "thumbs_up" | "victory" | "palm_open" | "fist" |
// "index_point" | "none" | "unknown" (bad input).
function recognizeGesture(landmarks) {
  if (!landmarks || landmarks.length < 21) return "unknown";
  // Landmark indices: 0 wrist, 4 thumb tip, 8 index tip, 12 middle tip,
  // 16 ring tip, 20 pinky tip; 2/5/9/13/17 are the corresponding knuckles.
  const thumbTip = landmarks[4];
  const thumbMcp = landmarks[2];
  const indexTip = landmarks[8];
  const indexMcp = landmarks[5];
  const middleTip = landmarks[12];
  const middleMcp = landmarks[9];
  const ringTip = landmarks[16];
  const pinkyTip = landmarks[20];
  const wrist = landmarks[0];

  // Finger extended vs. curled: compare tip y against knuckle y with a
  // small pixel margin to reject jitter.
  const thumbUp = (thumbTip[1] < thumbMcp[1] - 15);
  const indexStraight = (indexTip[1] < indexMcp[1] - 20);
  const middleStraight = (middleTip[1] < middleMcp[1] - 20);
  const ringStraight = (ringTip[1] < landmarks[13][1] - 18);
  const pinkyStraight = (pinkyTip[1] < landmarks[17][1] - 18);

  // Fist: no finger extended, thumb not raised.
  const isFist = (!indexStraight && !middleStraight && !ringStraight && !pinkyStraight && !thumbUp);

  // Open palm: four fingers extended plus an abducted thumb
  // (thumb sits far laterally from the wrist or index knuckle).
  const palmOpen = (indexStraight && middleStraight && ringStraight && pinkyStraight && (thumbTip[0] - wrist[0] > 60 || thumbTip[0] - landmarks[5][0] > 30));

  // V sign: index + middle extended, ring + pinky curled.
  const vGesture = (indexStraight && middleStraight && !ringStraight && !pinkyStraight);

  // Thumbs-up: only the thumb is clearly raised.
  const thumbOnly = (thumbUp && !indexStraight && !middleStraight && !ringStraight && !pinkyStraight);

  // Pointing: only the index finger is extended.
  const indexPoint = (indexStraight && !middleStraight && !ringStraight && !pinkyStraight && !thumbUp);

  if (thumbOnly) return "thumbs_up";
  if (vGesture) return "victory";
  if (palmOpen) return "palm_open";
  if (isFist) return "fist";
  if (indexPoint) return "index_point";
  return "none";
}

// Map a recognized gesture name to its browser action id.
// Returns null for unmapped gestures ("none", "unknown", etc.).
function mapGestureToAction(gesture) {
  switch (gesture) {
    case "thumbs_up": return "open_baidu";
    case "victory": return "open_google";
    case "fist": return "scroll_bottom";
    case "palm_open": return "refresh";
    case "index_point": return "back";
    default: return null;
  }
}

// Gesture trigger with per-gesture cooldown and logging: ignores
// non-gestures, suppresses repeats of the same gesture inside
// GESTURE_COOLDOWN, then dispatches the mapped action.
let lastGesture = "";
function handleGesture(gesture) {
  if (gesture === "none" || gesture === "unknown") return;
  const now = Date.now();
  if (gesture === lastGesture && (now - lastExecTime) < GESTURE_COOLDOWN) return;
  lastGesture = gesture;
  const action = mapGestureToAction(gesture);
  if (action) {
    // Human-readable labels (replaces the original unreadable nested ternary).
    const labels = {
      thumbs_up: "👍 大拇指竖起",
      victory: "✌️ V手势",
      fist: "✊ 握拳",
      palm_open: "🖐️ 手掌打开",
      index_point: "👉 食指指向"
    };
    const gestureName = labels[gesture] || "👉 食指指向";
    addLog(`✋ 手势【${gestureName}】触发 → ${action}`, 'action');
    executeAction(action);
  }
}

// ---------- Draw hand keypoints and a simplified skeleton on the overlay.
// landmarks are handpose *pixel* coordinates in video space (the 15–20 px
// thresholds in recognizeGesture confirm this); the canvas is resized to
// the video in initCamera, so points map 1:1. The original multiplied by
// canvas dimensions again, pushing every point off-canvas. The x axis is
// mirrored here to line up with the CSS scaleX(-1) mirrored video.
function drawHand(landmarks, ctx, videoWidth, videoHeight) {
  if (!ctx) return;
  ctx.clearRect(0, 0, canvas.width, canvas.height);
  if (!landmarks) return;
  const mirrorX = (x) => canvas.width - x; // match the mirrored video feed
  // Keypoints
  for (let i = 0; i < landmarks.length; i++) {
    ctx.beginPath();
    ctx.arc(mirrorX(landmarks[i][0]), landmarks[i][1], 4, 0, 2 * Math.PI);
    ctx.fillStyle = "#00ffff";
    ctx.fill();
  }
  // Simplified skeleton: accumulate all segments into one path and
  // stroke once (the original stroked the growing path every iteration,
  // re-drawing earlier segments each time).
  const connections = [[0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[5,9],[9,10],[10,11],[11,12],[9,13],[13,14],[14,15],[15,16],[13,17],[17,18],[18,19],[19,20],[0,17]];
  ctx.beginPath();
  ctx.strokeStyle = "#a5f0ff";
  ctx.lineWidth = 2;
  for (const [a, b] of connections) {
    if (landmarks[a] && landmarks[b]) {
      ctx.moveTo(mirrorX(landmarks[a][0]), landmarks[a][1]);
      ctx.lineTo(mirrorX(landmarks[b][0]), landmarks[b][1]);
    }
  }
  ctx.stroke();
}

// Per-frame detection loop: estimate hands, draw the overlay, classify
// the gesture, then schedule the next frame. Runs continuously; it idles
// (status message only) until the model and camera are both ready.
async function detectHand() {
  if (!handModel || !cameraActive || !video.videoWidth) {
    if (handModelStatusSpan) handModelStatusSpan.innerText = handModel ? "⚡ 等待摄像头" : "⌛ 模型未加载";
    requestAnimationFrame(detectHand);
    return;
  }
  try {
    const predictions = await handModel.estimateHands(video, false);
    if (predictions.length > 0) {
      // landmarks are [x, y, z] in pixel coordinates of the video frame
      // (NOT normalized 0–1 as the original comment claimed).
      const landmarks = predictions[0].landmarks;
      if (landmarks) {
        drawHand(landmarks, ctx, video.videoWidth, video.videoHeight);
        const gesture = recognizeGesture(landmarks);
        const labels = {
          thumbs_up: "👍 拇指赞",
          victory: "✌️ 胜利/V",
          fist: "✊ 拳头",
          palm_open: "🖐️ 手掌",
          index_point: "👉 食指指向"
        };
        gestureSpan.innerText = labels[gesture] || "🤚 其他";
        handleGesture(gesture);
      } else {
        ctx.clearRect(0, 0, canvas.width, canvas.height);
        gestureSpan.innerText = "未识别手掌";
      }
    } else {
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      gestureSpan.innerText = "无手势";
    }
  } catch (err) {
    console.warn("手势检测错误", err);
  }
  requestAnimationFrame(detectHand);
}

// Load the handpose model from the CDN-provided global `handpose`,
// updating the status badge and log on success/failure.
async function loadHandModel() {
  handModelStatusSpan.innerText = "🔄 加载手势模型 (约5MB)…";
  try {
    handModel = await handpose.load();
    handModelStatusSpan.innerText = "✅ 手势模型就绪";
    addLog("🧠 手势识别模型已加载,可用摄像头控制", "info");
  } catch (e) {
    console.error(e);
    handModelStatusSpan.innerText = "❌ 模型加载失败,刷新重试";
    addLog("⚠️ 手势模型加载失败,请检查网络", "info");
  }
}

// ---------- Camera initialization (canvas kept in sync with the video) ----------
// Requests the webcam, waits for metadata, sizes the overlay canvas to the
// real video resolution (so landmark pixel coordinates line up), and flips
// cameraActive so the detection loop starts doing work.
// Returns true on success, false on unsupported/denied access.
async function initCamera() {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    addLog("❌ 浏览器不支持摄像头访问", "error");
    return false;
  }
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
    video.srcObject = stream;
    await new Promise((resolve) => { video.onloadedmetadata = () => resolve(); });
    video.play();
    // Resize the overlay whenever the video resolution is (re)negotiated.
    const updateCanvasSize = () => {
      if (video.videoWidth) {
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
      } else {
        canvas.width = 640;
        canvas.height = 480;
      }
    };
    updateCanvasSize();
    video.addEventListener('resize', updateCanvasSize);
    cameraActive = true;
    addLog("📷 摄像头已激活,现在可以使用手势交互 (竖起拇指, V字等)", "info");
    speak("摄像头已开启,手势控制已就绪");
    return true;
  } catch (err) {
    addLog(`摄像头错误: ${err.message}`, "error");
    return false;
  }
}

// ---------- Voice recognition module (Web Speech API) ----------
let recognition = null;  // SpeechRecognition instance (lazy-created)
let isListening = false; // true while a recognition session is active
// Set up a single-shot zh-CN recognizer wired to the status badge and the
// command processor. Returns false when the browser lacks support.
function initSpeech() {
  if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
    voiceStatusSpan.innerText = "不支持语音";
    addLog("⚠️ 当前浏览器不支持语音识别", "error");
    return false;
  }
  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  recognition = new SpeechRecognition();
  recognition.continuous = false;      // one utterance per session
  recognition.interimResults = false;  // only final transcripts
  recognition.lang = 'zh-CN';
  recognition.onstart = () => {
    isListening = true;
    voiceStatusSpan.innerHTML = '🎤 聆听中… 说出指令';
    voiceStatusSpan.classList.add('listening');
  };
  recognition.onend = () => {
    isListening = false;
    voiceStatusSpan.innerHTML = '待命';
    voiceStatusSpan.classList.remove('listening');
  };
  recognition.onresult = (event) => {
    const transcript = event.results[0][0].transcript;
    speechTextDiv.innerText = transcript;
    addLog(`🎤 语音: ${transcript}`, 'info');
    processVoiceCommand(transcript);
  };
  recognition.onerror = (e) => {
    voiceStatusSpan.innerHTML = '语音错误';
    addLog(`语音识别错误: ${e.error}`, 'error');
    isListening = false;
  };
  return true;
}

// Match a recognized transcript against the known voice commands
// (substring matching, most specific first) and dispatch the action.
// Lower-cased up front so English words like "github" match regardless of case.
function processVoiceCommand(text) {
  const cmd = text.toLowerCase().trim();
  if (cmd.includes("打开百度") || cmd.includes("百度一下")) {
    executeAction("open_baidu");
    speak("正在打开百度");
  } else if (cmd.includes("打开谷歌") || cmd.includes("google")) {
    executeAction("open_google");
    speak("打开谷歌搜索");
  } else if (cmd.includes("打开github") || cmd.includes("github")) {
    executeAction("open_github");
  } else if (cmd.includes("刷新") || cmd.includes("刷新页面")) {
    executeAction("refresh");
    speak("刷新页面");
  } else if (cmd.includes("滚动到底部") || cmd.includes("到底部")) {
    executeAction("scroll_bottom");
    speak("滚动到底部");
  } else if (cmd.includes("返回") || cmd.includes("返回上一页")) {
    executeAction("back");
    speak("返回上一页");
  } else if (cmd.includes("搜索天气")) {
    executeAction("search_weather", "今日天气");
    speak("搜索天气");
  } else if (cmd.includes("你好浏览器") || cmd.includes("嗨浏览器")) {
    executeAction("hello_browser");
  } else if (cmd.includes("搜索") && cmd.length > 2) {
    // Generic search: everything after "搜索" becomes the keyword.
    const keyword = cmd.replace("搜索", "").trim();
    if (keyword) executeAction("search_", keyword);
    else speak("请说出要搜索的内容");
  } else if (cmd.includes("打开")) {
    const site = cmd.replace("打开", "").trim();
    if (site === '百度') executeAction("open_baidu");
    else if (site === '谷歌') executeAction("open_google");
    else speak("未识别网址");
  } else {
    // Curly quotes kept inside the straight-quoted string — the original's
    // all-curly quoting made this line a syntax error.
    speak("未识别语音指令,试试“打开百度”或“刷新”");
    addLog(`未处理指令: ${text}`, "info");
  }
}

// Toggle the speech-recognition session on/off (lazy-initializing the
// recognizer on first use). start() throws if the mic permission was
// denied, which we surface as a log entry instead of an uncaught error.
function toggleListening() {
  if (!recognition) {
    if (!initSpeech()) return;
  }
  if (isListening) {
    recognition.stop();
    voiceStatusSpan.innerHTML = "停止";
  } else {
    try {
      recognition.start();
    } catch (e) {
      addLog("请先允许麦克风权限, 刷新页面后点击", "error");
    }
  }
}

// ----- Event bindings -----
enableCamBtn.onclick = async () => {
  await initCamera();
};
resetHandBtn.onclick = () => {
  if (handModel) {
    addLog("🔄 手势模型重置", "info");
    gestureSpan.innerText = "——";
  } else {
    loadHandModel();
  }
};
voiceBtn.onclick = toggleListening;

// Page startup: load the model and pre-initialize speech, but do NOT open
// the camera automatically — that requires an explicit user click.
loadHandModel();
initSpeech();
// Start the detection loop now; it idles until the camera is enabled.
detectHand();
addLog("💡 点击「开启摄像头」允许权限,比出大拇指👍 打开百度,或点击🎤说出指令", "info");
// Polite notification prompt — guarded, since Notification is not
// available in all browsers/contexts (the original could throw).
if ('Notification' in window && Notification.permission === "default") Notification.requestPermission();
</script>
</body>
</html>

1 Comment

  1. Hi, this is a comment.
    To get started with moderating, editing, and deleting comments, please visit the Comments screen in the dashboard.
    Commenter avatars come from Gravatar.

Leave a Reply to A WordPress Commenter Cancel reply

Your email address will not be published. Required fields are marked *