<title>Multimodal Media Analyzer</title>
/* Global reset and page shell.
   Property names use ASCII hyphens; the en-dash variants are not valid CSS
   and would be dropped by the parser. */
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}
header { margin-bottom: 1.5rem; }
header h1 { font-size: 1.5rem; }
header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }
/* Model status indicators: one pill per model, amber while loading and
   green once the script adds the .ready class. */
.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}
.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}
.model-badge.ready { background: #dcfce7; color: #15803d; }
/* Tab bar: equal-width tabs; the selected one carries .active. */
.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}
.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}
.tab.active { background: #2563eb; color: white; font-weight: 600; }
/* Input panels: only the panel carrying .active is displayed. */
.panel { display: none; }
.panel.active { display: block; }

/* Image upload drop zone; the file input inside it stays hidden. */
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}
.upload-area input { display: none; }
#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none; /* switched to block by the script once an image is loaded */
  object-fit: cover;
}

/* Recorder controls */
.mic-center { text-align: center; padding: 1rem 0; }
#rec-btn {
  width: 72px; height: 72px;
  border-radius: 50%; border: none;
  background: #dc2626; color: white;
  font-size: 1.6rem; cursor: pointer;
  display: flex; align-items: center; justify-content: center;
  margin: 0 auto 0.5rem;
}
#rec-btn.recording { background: #374151; }
#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }
#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }
#rec-hint { font-size: 0.85rem; color: #64748b; }
#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }
/* Results grid: responsive columns of result cards. */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}
.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}
.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

/* One classification row: label on the left, score chip on the right. */
.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}
.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}
.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}
.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}
.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }
#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Narrow screens: force a single column of cards. */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}
<!-- Page header with one status badge per model; the script updates the
     badges by id as each model becomes ready. -->
<header>
  <h1>Multimodal Media Analyzer</h1>
  <p>Image classification, captioning, and speech transcription — all in your browser.</p>
  <div class="model-status-bar">
    <span class="model-badge" id="badge-cls">Classifier: loading...</span>
    <span class="model-badge" id="badge-cap">Captioner: loading...</span>
    <span class="model-badge" id="badge-asr">Whisper: loading...</span>
  </div>
</header>
<!-- Global status line, rewritten by the script as work progresses. -->
<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>
<!-- Tab bar: data-tab names the panel to show (element id "panel-<data-tab>"). -->
<div class="tabs">
  <div class="tab active" data-tab="image">🖼 Image Analysis</div>
  <div class="tab" data-tab="speech">🎙 Speech Transcription</div>
</div>
<!-- Image panel -->
<div class="panel active" id="panel-image">
  <div class="upload-area" id="img-drop">
    <p>Click or drag an image to analyze</p>
    <p style="font-size:0.8rem;color:#94a3b8;margin-top:0.3rem">
      JPG, PNG, WebP, GIF supported
    </p>
    <!-- Hidden file picker: the script looks this up as #img-input and
         opens it when the drop zone is clicked. -->
    <input type="file" id="img-input" accept="image/*" />
  </div>
  <img id="img-preview" alt="Preview" />
</div>
<!-- Speech panel -->
<div class="panel" id="panel-speech">
  <div class="mic-center">
    <button id="rec-btn" disabled>🎙</button>
    <!-- Live waveform target: the script looks this up as #wave-canvas and
         draws on its 2D context while recording. -->
    <canvas id="wave-canvas" width="300" height="60"></canvas>
    <div id="rec-timer">0:00</div>
    <div id="rec-hint">Waiting for Whisper model...</div>
  </div>
</div>
<!-- Results – shown for both modes. The grid and every card start hidden;
     the script reveals the cards relevant to the current analysis. -->
<div class="results-grid" id="results-grid" style="display:none">
  <!-- Image results (shown in image mode) -->
  <div class="result-card" id="card-cls" style="display:none">
    <h3>Classification</h3>
    <div id="cls-content">
      <p class="placeholder-text">No results yet.</p>
    </div>
  </div>
  <div class="result-card" id="card-cap" style="display:none">
    <h3>Caption</h3>
    <div id="cap-content">
      <p class="placeholder-text">No caption yet.</p>
    </div>
  </div>
  <!-- Speech results (shown in speech mode) -->
  <div class="result-card" id="card-asr" style="display:none">
    <h3>Transcription</h3>
    <div id="asr-content">
      <p class="placeholder-text">Record audio to see the transcription.</p>
    </div>
  </div>
</div>
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
// ── Pipeline references ───────────────────────────────────────────────
// Each handle stays undefined until the model-loading code assigns it.
let classifier;
let captioner;
let transcriber;
// Incremented by markReady; the UI unlocks when it reaches 3.
let readyCount = 0;
// Update a model badge to "ready" state
/**
 * Flags one model badge as ready and, once three distinct badges have been
 * flagged, unlocks the recording UI and updates the global status line.
 *
 * Idempotent per badge: a badge that already carries the `ready` class is
 * ignored, so repeated notifications for the same model cannot inflate
 * `readyCount` and unlock the UI early.
 *
 * @param {string} badgeId - Element id of the badge to update.
 * @param {string} label - Human-readable model name shown in the badge.
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);
  if (badge.classList.contains('ready')) {
    return;
  }
  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;
  if (readyCount === 3) {
    globalStatus.textContent =
      'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}
// Load all three pipelines simultaneously
/**
 * Loads one pipeline and flags its badge once the pipeline object exists.
 *
 * Readiness is taken from the resolution of the `pipeline()` promise rather
 * than from `progress_callback`: progress events are emitted during loading
 * (a 'done' status is reported per downloaded file — see the Transformers.js
 * docs), so keying on them can call markReady several times per model and
 * before the pipeline is usable.
 *
 * @param {string} task - Pipeline task name.
 * @param {string} modelId - Model repository id.
 * @param {string} badgeId - Id of the badge element to flag.
 * @param {string} label - Model name shown in the badge.
 * @returns {Promise<Function>} The loaded pipeline.
 */
async function loadPipeline(task, modelId, badgeId, label) {
  const loaded = await pipeline(task, modelId, { dtype: 'q8' });
  markReady(badgeId, label);
  return loaded;
}

Promise.all([
  loadPipeline('image-classification', 'Xenova/vit-base-patch16-224', 'badge-cls', 'Classifier'),
  loadPipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', 'badge-cap', 'Captioner'),
  loadPipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', 'badge-asr', 'Whisper'),
]).then(([cls, cap, asr]) => {
  classifier = cls;
  captioner = cap;
  transcriber = asr;
}).catch((err) => {
  globalStatus.textContent = `Error loading models: ${err.message}`;
});
// ── UI references ─────────────────────────────────────────────────────
// Element handles shared by the functions and listeners in this script.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
// The waveform is only a visual aid. If the page has no #wave-canvas, keep a
// null context instead of throwing here, which would abort the whole module
// (image analysis included) at load time.
const waveCanvas = document.getElementById('wave-canvas');
const waveCtx = waveCanvas ? waveCanvas.getContext('2d') : null;
// ── Image analysis ────────────────────────────────────────────────────
/**
 * Runs classification and captioning on an image and renders both results.
 *
 * Shows the two image result cards (hiding the speech card), runs both
 * models in parallel, and reports progress or failure in the global status
 * line. Returns early with a status message if either model is not loaded.
 *
 * @param {string} dataUrl - Image source passed straight to both pipelines.
 * @returns {Promise<void>}
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  // Model output is inserted through innerHTML, so escape it first.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');

  globalStatus.textContent = 'Running classification and captioning…';
  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';
  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels
    clsContent.innerHTML = classResults
      .map(({ label, score }) => `
        <div class="label-item">
          <span>${escapeHtml(label)}</span>
          <span class="label-score">${(score * 100).toFixed(1)}%</span>
        </div>`)
      .join('');

    // Render generated caption
    const caption = captionResults[0]?.generated_text ?? 'No caption.';
    capContent.innerHTML = `<p class="caption-body">“${escapeHtml(caption)}”</p>`;
    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}
// File upload handler for images
const imgDrop = document.getElementById('img-drop');
// The drop zone delegates clicks to a hidden file picker. If the page markup
// has no #img-input, create one next to the drop zone so later listener
// registration does not hit a null reference.
const imgInput = document.getElementById('img-input') ?? (() => {
  const picker = document.createElement('input');
  picker.type = 'file';
  picker.accept = 'image/*';
  picker.id = 'img-input';
  picker.hidden = true;
  imgDrop.after(picker);
  return picker;
})();
const imgPrev = document.getElementById('img-preview');
/**
 * Reads an image file as a data URL, shows it in the preview element and
 * passes it to analyzeImage. Files whose MIME type does not start with
 * "image/" (and a missing file) are ignored; read failures are reported in
 * the global status line.
 *
 * @param {File | undefined} file - File chosen in the picker or dropped.
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) {
    return;
  }
  const reader = new FileReader();
  reader.onload = () => {
    const dataUrl = reader.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    analyzeImage(dataUrl);
  };
  reader.onerror = () => {
    globalStatus.textContent =
      `Error: could not read image (${reader.error?.message ?? 'unknown error'})`;
  };
  reader.readAsDataURL(file);
}
// Click-to-upload: only wired when the file picker exists, so a missing
// input cannot abort the rest of the script.
if (imgInput) {
  imgDrop.addEventListener('click', (e) => {
    // Ignore clicks that originate from the picker itself.
    if (e.target !== imgInput) {
      imgInput.click();
    }
  });
  imgInput.addEventListener('change', (e) => {
    handleImageFile(e.target.files[0]);
    // Clear the selection so choosing the same file again fires `change`.
    e.target.value = '';
  });
}
// Drag-and-drop: dragover must be cancelled for the drop event to fire.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());
imgDrop.addEventListener('drop', (e) => {
  e.preventDefault();
  handleImageFile(e.dataTransfer.files[0]);
});
// ── Audio decoding helper ─────────────────────────────────────────────
/**
 * Decodes an encoded audio buffer using a 16 kHz AudioContext and returns
 * the samples of its first channel.
 *
 * The context is closed once decoding finishes (or fails): a fresh context
 * is created on every call, so leaving them open would accumulate one per
 * recording.
 *
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<Float32Array>} Channel-0 samples.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0); // Mono Float32Array at 16kHz
  } finally {
    await audioCtx.close();
  }
}
// ── Speech transcription ──────────────────────────────────────────────
/**
 * Transcribes audio with the speech pipeline and renders the text in the
 * transcription card (hiding the image cards). Progress and failures are
 * reported in the global status line.
 *
 * @param {Float32Array} audioData - Audio samples handed to the transcriber.
 * @returns {Promise<void>}
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Whisper model still loading. Please wait.';
    return;
  }

  // The transcript is inserted through innerHTML, so escape it first.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const asrContent = document.getElementById('asr-content');

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });
    const text = (result?.text ?? '').trim();
    asrContent.innerHTML = `<p class="transcript-body">${escapeHtml(text)}</p>`;
    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}
// ── Microphone recording ──────────────────────────────────────────────
// Mutable recorder state shared by drawWave and the record-button handler.
let mediaRecorder;
let audioChunks = [];
let timerInterval;
let analyserNode;
let animId;
// Elapsed recording time in seconds, shown in the timer.
let secs = 0;
/**
 * Draws one frame of the live waveform from the analyser's time-domain data
 * and schedules the next frame, storing the request id in `animId` so the
 * loop can be cancelled. Does nothing when there is no canvas context or no
 * analyser to read from.
 */
function drawWave() {
  if (!waveCtx || !analyserNode) {
    return;
  }
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  const { width, height } = waveCanvas;
  const halfHeight = height / 2;
  waveCtx.clearRect(0, 0, width, height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;
  samples.forEach((value, i) => {
    const x = (i / samples.length) * width;
    // Byte samples are centred on 128, so 128 maps to the vertical middle.
    const y = (value / 128.0) * halfHeight;
    if (i === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  });
  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}
// Record button: first click starts a recording, second click stops it and
// hands the captured audio to decodeAudio + runTranscription.
recBtn.addEventListener('click', async () => {
  if (mediaRecorder?.state === 'recording') {
    // Stop: the rest of the pipeline runs in the recorder's onstop handler.
    mediaRecorder.stop();
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    // Block a new recording until the current one has been processed.
    recBtn.disabled = true;
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx?.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  let stream;
  let analyserCtx;
  // Releases the microphone and the analyser's AudioContext.
  const releaseCapture = () => {
    stream?.getTracks().forEach((track) => track.stop());
    // close() only rejects when the context is already closed; nothing to do.
    analyserCtx?.close().catch(() => {});
  };

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    analyserCtx = new AudioContext();
    analyserNode = analyserCtx.createAnalyser();
    analyserCtx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    audioChunks = [];
    recorder.ondataavailable = (e) => {
      if (e.data.size) {
        audioChunks.push(e.data);
      }
    };
    recorder.onstop = async () => {
      // All chunks are in hand, so free the capture resources first; this
      // also guarantees the mic is released if decoding fails below.
      releaseCapture();
      try {
        // Use the container the recorder actually produced; fall back to
        // the previous hard-coded type if the browser does not report one.
        const blob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/webm' });
        const audioData = await decodeAudio(await blob.arrayBuffer());
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        recHint.textContent = `Audio error: ${err.message}`;
      } finally {
        recBtn.disabled = false;
      }
    };

    recorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';
    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent =
        `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);
    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    // If recording never started, nothing will fire onstop to clean up.
    if (mediaRecorder?.state !== 'recording') {
      releaseCapture();
    }
    recHint.textContent = `Mic error: ${err.message}`;
  }
});
// ── Tab switching ─────────────────────────────────────────────────────
// Clicking a tab activates it and the panel whose id is "panel-<data-tab>".
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    const panel = document.getElementById(`panel-${tab.dataset.tab}`);
    // Leave the current selection alone if the target panel is missing.
    if (!panel) {
      return;
    }
    document.querySelectorAll('.tab, .panel').forEach((el) =>
      el.classList.remove('active'));
    tab.classList.add('active');
    panel.classList.add('active');
  });
});
