Oct 17, 2025 - 20 min read
Extract Structured Data From PDF Images With Vision Models

Render PDF pages as screenshots and use OpenAI's vision models with structured outputs to extract typed invoice data from scanned documents and complex layouts

Patrick the AI Engineer

This tutorial is accompanied by an interactive playground. Test the code, experiment with different parameters, and see the results in real time.


Sometimes PDFs are scanned documents or have complex layouts that text extraction can't handle well. You can render PDF pages as images and send them directly to vision models instead. This works for invoices, receipts, forms—anything where visual layout matters.

We're going to build a tool that converts PDF pages to screenshots, then uses OpenAI's vision model with structured outputs to extract invoice data into a typed object.

Rendering PDF Pages as Images

Start with a server endpoint that accepts a PDF and returns page screenshots. We'll use pdf-parse, which includes screenshot capabilities.

import { defineEventHandler, readMultipartFormData } from 'h3'
import { PDFParse } from 'pdf-parse'

export default defineEventHandler(async (event) => {
  const form = await readMultipartFormData(event)
  const file = form.find(f => f.name === 'file' && 'data' in f)
  
  const buffer = file.data as Buffer
  const parser = new PDFParse({ data: buffer })
  const screenshotResult = await parser.getScreenshot()
  await parser.destroy()
})

The getScreenshot() method returns page data as Uint8Arrays. We need to convert these to base64 strings for easy transport to the browser.

export default defineEventHandler(async (event) => {
  const form = await readMultipartFormData(event)
  const file = form.find(f => f.name === 'file' && 'data' in f)
  
  const buffer = file.data as Buffer
  const parser = new PDFParse({ data: buffer })
  const screenshotResult = await parser.getScreenshot()
  await parser.destroy()
  
  const pages = screenshotResult.pages.map(p => ({
    pageNumber: p.pageNumber,
    base64Png: Buffer.from(p.data).toString('base64')
  }))

  return { pages }
})

Each page comes back with its number and the PNG data as a base64 string. The browser can turn these into image URLs with data:image/png;base64,${base64}.

Add error handling (createError also comes from h3, so extend the import):

import { defineEventHandler, readMultipartFormData, createError } from 'h3'
import { PDFParse } from 'pdf-parse'

export default defineEventHandler(async (event) => {
  try {
    const form = await readMultipartFormData(event)
    if (!form || form.length === 0)
      return { error: 'No form data received' }

    const file = form.find(f => f.name === 'file' && 'data' in f)
    if (!file || !('data' in file))
      throw new Error('No file uploaded')

    const buffer = file.data as Buffer
    const parser = new PDFParse({ data: buffer })
    const screenshotResult = await parser.getScreenshot()
    await parser.destroy()

    const pages = screenshotResult.pages.map(p => ({
      pageNumber: p.pageNumber,
      base64Png: Buffer.from(p.data).toString('base64')
    }))

    return { pages }
  } catch (err) {
    throw createError({
      statusCode: 500,
      statusMessage: 'Failed to render PDF screenshots',
      data: err instanceof Error ? err.message : 'Unknown error'
    })
  }
})

That's the server side. Save this as server/api/pdf/screenshot.post.ts.

Uploading and Previewing Pages

Now build the client side. Start with file upload and status display.

<script setup lang="ts">
import { ref } from 'vue'

const status = ref('Ready - Upload a PDF to render images')
const extracting = ref(false)
const imagePages = ref<Array<{ pageNumber: number, base64Png: string }>>([])

function handleFileUpload(e: Event) {
  const file = (e.target as HTMLInputElement).files?.[0]
  if (!file) return
  
  const isPdf = file.type === 'application/pdf'
  if (!isPdf) {
    status.value = 'Please select a PDF file.'
    return
  }
  
  void renderPdfImages(file)
}
</script>

The renderPdfImages function sends the file to our endpoint:

async function renderPdfImages(file: File) {
  extracting.value = true
  imagePages.value = []
  
  const form = new FormData()
  form.append('file', file)
  const res = await fetch('/api/pdf/screenshot', { 
    method: 'POST', 
    body: form 
  })
  
  const json = await res.json()
  if (!res.ok || json.error) {
    status.value = 'Failed to render images from PDF'
    extracting.value = false // reset the flag before bailing out
    return
  }
  
  imagePages.value = json.pages || []
  status.value = `Rendered ${imagePages.value.length} page image(s)`
  extracting.value = false
}

Add a page selector so users can pick which page to analyze:

<script setup lang="ts">
import { ref, computed } from 'vue'

const imagePages = ref<Array<{ pageNumber: number, base64Png: string }>>([])
const selectedPageIndex = ref(0)

const currentImageUrl = computed(() => {
  const page = imagePages.value[selectedPageIndex.value]
  if (!page) return null
  return `data:image/png;base64,${page.base64Png}`
})
</script>

<template>
  <select v-model.number="selectedPageIndex">
    <option v-for="(p, idx) in imagePages" :key="p.pageNumber" :value="idx">
      Page {{ p.pageNumber }}
    </option>
  </select>
  
  <img v-if="currentImageUrl" :src="currentImageUrl" alt="PDF page" />
</template>

The computed property converts the selected page's base64 string into a data URL the browser can display.

Defining the Invoice Schema

Before we send images to the LLM, we need a schema. This is identical to text-based extraction—the schema doesn't care whether you're sending text or images.

import { z } from 'zod'

const invoiceSchema = z.object({
  invoiceNumber: z.string().min(1).describe('Invoice identifier number'),
  invoiceDate: z.string().min(1).describe('Invoice date'),
  vendorName: z.string().min(1).describe('Vendor/seller name'),
  total: z.number().describe('Total amount'),
  lineItems: z.array(z.object({
    description: z.string().describe('Item description'),
    quantity: z.number().optional().describe('Quantity'),
    unitPrice: z.number().optional().describe('Unit price'),
    amount: z.number().optional().describe('Line item total')
  })).optional().describe('Invoice line items')
})

type InvoiceData = z.infer<typeof invoiceSchema>

Add optional fields for addresses, dates, and payment terms:

const invoiceSchema = z.object({
  invoiceNumber: z.string().min(1).describe('Invoice identifier number'),
  invoiceDate: z.string().min(1).describe('Invoice date'),
  dueDate: z.string().optional().describe('Payment due date'),
  vendorName: z.string().min(1).describe('Vendor/seller name'),
  vendorAddress: z.string().optional().describe('Vendor address'),
  customerName: z.string().optional().describe('Customer/buyer name'),
  customerAddress: z.string().optional().describe('Customer address'),
  currency: z.string().optional().describe('Currency code'),
  subtotal: z.number().optional().describe('Subtotal amount before tax'),
  tax: z.number().optional().describe('Tax amount'),
  total: z.number().describe('Total amount'),
  purchaseOrderNumber: z.string().optional().describe('Purchase order number'),
  paymentTerms: z.string().optional().describe('Payment terms'),
  lineItems: z.array(z.object({
    description: z.string().describe('Item description'),
    quantity: z.number().optional().describe('Quantity'),
    unitPrice: z.number().optional().describe('Unit price'),
    amount: z.number().optional().describe('Line item total')
  })).optional().describe('Invoice line items')
})

The .describe() calls tell the LLM what each field represents. Optional fields get .optional() so the model knows it can skip them if they're not in the document.
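
To see what .optional() buys you, parse a minimal object by hand; validation passes as long as the required fields are present (the values below are made up):

// A minimal invoice: only the required fields are present.
// Parsing succeeds because every other field is .optional().
const minimal = invoiceSchema.parse({
  invoiceNumber: 'INV-001',
  invoiceDate: '2025-01-15',
  vendorName: 'Acme Corp',
  total: 123.45
})
// minimal is typed as InvoiceData; minimal.dueDate is string | undefined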

Sending Images to OpenAI

Now we can send the selected page image to OpenAI's vision model with structured output. Start with the API setup:

import { generateObject } from 'ai'
import { createOpenAI } from '@ai-sdk/openai'

const analyzing = ref(false)
const analysis = ref<InvoiceData | null>(null)
// Holds your OpenAI API key (e.g. from user input or runtime config)
const openaiKey = ref('')

async function analyzeInvoiceFromImage() {
  analyzing.value = true

  const openai = createOpenAI({ apiKey: openaiKey.value })
}

Vision models in the AI SDK accept multimodal messages with text and file content. We need to convert the base64 string to a Uint8Array:

async function analyzeInvoiceFromImage() {
  analyzing.value = true
  
  const openai = createOpenAI({ apiKey: openaiKey.value })
  
  const base64 = imagePages.value[selectedPageIndex.value]?.base64Png || ''
  const binary = atob(base64)
  const bytes = new Uint8Array(binary.length)
  for (let i = 0; i < binary.length; i++) {
    bytes[i] = binary.charCodeAt(i)
  }
}

The atob() function decodes base64 to a binary string, then we convert each character to its byte value. Now we can pass this to generateObject:

async function analyzeInvoiceFromImage() {
  analyzing.value = true
  
  const openai = createOpenAI({ apiKey: openaiKey.value })
  
  const base64 = imagePages.value[selectedPageIndex.value]?.base64Png || ''
  const binary = atob(base64)
  const bytes = new Uint8Array(binary.length)
  for (let i = 0; i < binary.length; i++) {
    bytes[i] = binary.charCodeAt(i)
  }
  
  const { object } = await generateObject({
    model: openai('gpt-5-mini'),
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: 'Extract structured invoice data from the following invoice image.' },
          { type: 'file', data: bytes, mediaType: 'image/png' }
        ]
      }
    ],
    schema: invoiceSchema
  })

  analysis.value = object
  analyzing.value = false
}

The message content array has two parts: a text instruction and the image file. The AI SDK handles sending this in the format OpenAI expects. The schema parameter ensures the response matches our Zod schema.

Add a system prompt and temperature setting:

const { object } = await generateObject({
  model: openai('gpt-5-mini'),
  system: 'You are a precise invoice data extraction assistant. Extract all available invoice information accurately. If a field is not present in the document, omit it.',
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: 'Extract structured invoice data from the following invoice image.' },
        { type: 'file', data: bytes, mediaType: 'image/png' }
      ]
    }
  ],
  schema: invoiceSchema,
  temperature: 0
})

Temperature 0 minimizes randomness, so the same invoice image will usually produce the same structured output.

Displaying Results

The returned object is fully typed as InvoiceData. Display it as formatted JSON:

<template>
  <button 
    :disabled="analyzing || !currentImageUrl" 
    @click="analyzeInvoiceFromImage"
  >
    {{ analyzing ? 'Analyzing...' : 'Analyze Invoice' }}
  </button>
  
  <pre v-if="analysis">{{ JSON.stringify(analysis, null, 2) }}</pre>
</template>

Add error handling to show when extraction fails:

const analysisError = ref<string | null>(null)

async function analyzeInvoiceFromImage() {
  analyzing.value = true
  analysisError.value = null

  try {
    const openai = createOpenAI({ apiKey: openaiKey.value })
    
    const base64 = imagePages.value[selectedPageIndex.value]?.base64Png || ''
    const binary = atob(base64)
    const bytes = new Uint8Array(binary.length)
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i)
    }
    
    const { object } = await generateObject({
      model: openai('gpt-5-mini'),
      system: 'You are a precise invoice data extraction assistant.',
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: 'Extract structured invoice data from the following invoice image.' },
            { type: 'file', data: bytes, mediaType: 'image/png' }
          ]
        }
      ],
      schema: invoiceSchema,
      temperature: 0
    })
    
    analysis.value = object
  } catch (err) {
    analysisError.value = err instanceof Error ? err.message : 'Failed to analyze invoice.'
    analysis.value = null
  } finally {
    analyzing.value = false
  }
}

Display the error if extraction fails:

<template>
  <button 
    :disabled="analyzing || !currentImageUrl" 
    @click="analyzeInvoiceFromImage"
  >
    {{ analyzing ? 'Analyzing...' : 'Analyze Invoice' }}
  </button>
  
  <div v-if="analysisError" class="error">{{ analysisError }}</div>  
  <pre v-if="analysis">{{ JSON.stringify(analysis, null, 2) }}</pre>
</template>

That's the core implementation. Upload a PDF, it renders to images, pick a page, click analyze, and get structured data back.

When to Use Image vs Text Extraction

Vision models are slower and more expensive than text extraction. For clean, digital PDFs with selectable text, use text extraction with pdf-parse's getText() method. It's faster and cheaper.
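
For comparison, here's a minimal text-extraction sketch using the same parser setup (assuming getText() resolves to an object with a text field; check the pdf-parse docs for the exact result shape in your version):

import { PDFParse } from 'pdf-parse'

// Minimal sketch: plain-text extraction with the same PDFParse setup
async function extractText(buffer: Buffer): Promise<string> {
  const parser = new PDFParse({ data: buffer })
  const textResult = await parser.getText()
  await parser.destroy()
  return textResult.text
}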

Use image extraction when:

  • The PDF is a scanned document (no text layer)
  • Layout matters (tables, forms with boxes and lines)
  • Text extraction produces garbled output
  • You need to read handwriting or stamps

Some invoices have both typed text and handwritten notes. Vision models can read both, while text extraction only gets the typed parts.

Cost and Performance

Vision model calls cost more than text-based extraction. GPT-5 Mini charges per image based on size. A single PDF page costs roughly $0.01-0.03 depending on resolution; at $0.02 per page, a 1,000-page batch runs about $20. If you're processing hundreds of documents, the costs add up quickly.

The server endpoint renders all pages at once. For multi-page PDFs, you could modify it to render only the page the user selects, saving memory. Or render at lower DPI if you don't need full resolution.

// Lower DPI reduces file size and cost
const screenshotResult = await parser.getScreenshot({ 
  scale: 1.0  // default is 2.0
})

Lower scale means smaller images, which are faster to upload and cheaper to analyze, but text might be harder for the model to read.
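
Here's a hypothetical endpoint variant (not part of the tutorial's main endpoint) that accepts a pageNumber field alongside the file and returns only that page. This saves transfer size; if your pdf-parse version supports rendering a page range directly, use that instead to also save render time and memory:

import { defineEventHandler, readMultipartFormData, createError } from 'h3'
import { PDFParse } from 'pdf-parse'

export default defineEventHandler(async (event) => {
  const form = await readMultipartFormData(event)
  const file = form?.find(f => f.name === 'file' && 'data' in f)
  if (!file) throw createError({ statusCode: 400, statusMessage: 'No file uploaded' })

  // Read the requested page number from the form data (defaults to page 1)
  const pagePart = form?.find(f => f.name === 'pageNumber')
  const pageNumber = pagePart ? Number(pagePart.data.toString()) : 1

  const parser = new PDFParse({ data: file.data as Buffer })
  const screenshotResult = await parser.getScreenshot()
  await parser.destroy()

  // Return only the requested page's screenshot
  const page = screenshotResult.pages.find(p => p.pageNumber === pageNumber)
  if (!page) throw createError({ statusCode: 404, statusMessage: 'Page not found' })

  return {
    pages: [{
      pageNumber: page.pageNumber,
      base64Png: Buffer.from(page.data).toString('base64')
    }]
  }
})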

Handling Multi-Page Invoices

Some invoices span multiple pages. You can analyze each page separately and merge the results, or concatenate multiple pages into a single request if the model's context window supports it.
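
If you go page by page, you need a merge step. A minimal sketch, assuming a hypothetical analyzePage(idx) helper that wraps the single-page generateObject call shown earlier and returns an InvoiceData:

// Hypothetical per-page merge: scalar fields come from the first page that
// has them; line items are concatenated across all pages.
async function analyzePagesSeparately(pageIndices: number[]): Promise<InvoiceData> {
  if (pageIndices.length === 0) throw new Error('No pages selected')

  const results = await Promise.all(pageIndices.map(idx => analyzePage(idx)))
  const [first, ...rest] = results

  const merged: InvoiceData = { ...first }
  for (const r of rest) {
    for (const [key, value] of Object.entries(r)) {
      if (key === 'lineItems') continue
      // Fill in fields the earlier pages didn't have
      if ((merged as Record<string, unknown>)[key] === undefined) {
        (merged as Record<string, unknown>)[key] = value
      }
    }
  }
  merged.lineItems = results.flatMap(r => r.lineItems ?? [])
  return merged
}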

To analyze multiple pages at once, modify the function to accept an array of page indices:

async function analyzeMultiplePages(pageIndices: number[]) {
  const content: Array<{ type: 'text' | 'file', text?: string, data?: Uint8Array, mediaType?: string }> = [
    { type: 'text', text: 'Extract structured invoice data from the following invoice pages.' }
  ]
  
  for (const idx of pageIndices) {
    const base64 = imagePages.value[idx]?.base64Png || ''
    const binary = atob(base64)
    const bytes = new Uint8Array(binary.length)
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i)
    }
    content.push({ type: 'file', data: bytes, mediaType: 'image/png' })
  }
  
  const { object } = await generateObject({
    model: openai('gpt-5-mini'),
    messages: [{ role: 'user', content }],
    schema: invoiceSchema,
    temperature: 0
  })
  
  return object
}

This sends multiple images in one request. The model sees all pages at once and can extract data that spans them, which is usually more accurate than analyzing pages separately and merging the results yourself.
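
Usage is a plain call with the indices of the pages you want, for example the first two rendered pages:

// Analyze pages at indices 0 and 1 of imagePages as one invoice
const invoice = await analyzeMultiplePages([0, 1])
console.log(invoice.vendorName, invoice.total)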

Wrapping Up

We built a PDF invoice extractor that works on scanned documents and complex layouts. The server renders PDF pages as screenshots, the client sends selected pages to OpenAI's vision model, and structured output gives us typed data we can use directly in our app.

<template>
  <div>
    <input
      type="file"
      accept="application/pdf"
      @change="handleFileUpload"
    >

    <label>{{ status }}</label>

    <div v-if="!imagePages.length">
      <p>Upload a PDF to get started</p>
    </div>

    <div v-else>
      <div>
        <label>Select page:</label>
        <select v-model.number="selectedPageIndex">
          <option
            v-for="(p, idx) in imagePages"
            :key="p.pageNumber"
            :value="idx"
          >
            Page {{ p.pageNumber }}
          </option>
        </select>
      </div>

      <img
        v-if="currentImageUrl"
        :src="currentImageUrl"
        alt="PDF page screenshot"
      >

      <div>
        <button
          :disabled="analyzing || !currentImageUrl || !openaiKey"
          @click="analyzeInvoiceFromImage"
        >
          {{ analyzing ? 'Analyzing...' : 'Analyze Invoice' }}
        </button>
      </div>

      <div v-if="analysisError" class="error">
        {{ analysisError }}
      </div>

      <pre v-if="analysis">{{ JSON.stringify(analysis, null, 2) }}</pre>
    </div>
  </div>
</template>

<script setup lang="ts">
import { ref, computed } from 'vue'
import { generateObject } from 'ai'
import { createOpenAI } from '@ai-sdk/openai'
import { z } from 'zod'

const status = ref('Ready - Upload a PDF to render images')
const extracting = ref(false)
const analyzing = ref(false)
const analysisError = ref<string | null>(null)
const analysis = ref<InvoiceData | null>(null)
const imagePages = ref<Array<{ pageNumber: number, base64Png: string }>>([])
const selectedPageIndex = ref(0)

// Replace with your own key management; process.env is only available here
// if your bundler inlines it at build time
const openaiKey = ref(process.env.OPENAI_API_KEY || '')

const currentImageUrl = computed(() => {
  const page = imagePages.value[selectedPageIndex.value]
  if (!page) return null
  return `data:image/png;base64,${page.base64Png}`
})

const invoiceSchema = z.object({
  invoiceNumber: z.string().min(1).describe('Invoice identifier number'),
  invoiceDate: z.string().min(1).describe('Invoice date'),
  dueDate: z.string().optional().describe('Payment due date'),
  vendorName: z.string().min(1).describe('Vendor/seller name'),
  vendorAddress: z.string().optional().describe('Vendor address'),
  customerName: z.string().optional().describe('Customer/buyer name'),
  customerAddress: z.string().optional().describe('Customer address'),
  currency: z.string().optional().describe('Currency code'),
  subtotal: z.number().optional().describe('Subtotal amount before tax'),
  tax: z.number().optional().describe('Tax amount'),
  total: z.number().describe('Total amount'),
  purchaseOrderNumber: z.string().optional().describe('Purchase order number'),
  paymentTerms: z.string().optional().describe('Payment terms'),
  lineItems: z.array(z.object({
    description: z.string().describe('Item description'),
    quantity: z.number().optional().describe('Quantity'),
    unitPrice: z.number().optional().describe('Unit price'),
    amount: z.number().optional().describe('Line item total')
  })).optional().describe('Invoice line items')
})
type InvoiceData = z.infer<typeof invoiceSchema>

function handleFileUpload(e: Event) {
  const file = (e.target as HTMLInputElement).files?.[0]
  if (!file) return

  const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf')
  if (!isPdf) {
    status.value = 'Please select a PDF file.'
    return
  }

  void renderPdfImages(file)
}

async function renderPdfImages(file: File) {
  try {
    analysisError.value = null
    extracting.value = true
    analysis.value = null
    imagePages.value = []

    const form = new FormData()
    form.append('file', file)
    const res = await fetch('/api/pdf/screenshot', { method: 'POST', body: form })
    const json = await res.json() as { pages?: Array<{ pageNumber: number, base64Png: string }>, error?: string }
    
    if (!res.ok || json.error) {
      status.value = 'Failed to render images from PDF'
      return
    }
    
    imagePages.value = json.pages || []
    selectedPageIndex.value = 0
    status.value = `Rendered ${imagePages.value.length} page image(s)`
  } catch {
    status.value = 'Failed to render images from PDF'
  } finally {
    extracting.value = false
  }
}

async function analyzeInvoiceFromImage() {
  if (!currentImageUrl.value) {
    analysisError.value = 'No image available.'
    return
  }
  if (!openaiKey.value) {
    analysisError.value = 'OpenAI API key not set.'
    return
  }
  
  analyzing.value = true
  analysisError.value = null
  
  try {
    const openai = createOpenAI({ apiKey: openaiKey.value })

    const base64 = imagePages.value[selectedPageIndex.value]?.base64Png || ''
    const binary = atob(base64)
    const bytes = new Uint8Array(binary.length)
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i)
    }

    const { object } = await generateObject({
      model: openai('gpt-5-mini'),
      system: 'You are a precise invoice data extraction assistant. Extract all available invoice information accurately. If a field is not present in the document, omit it.',
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: 'Extract structured invoice data from the following invoice image.' },
            { type: 'file', data: bytes, mediaType: 'image/png' }
          ]
        }
      ],
      schema: invoiceSchema,
      temperature: 0
    })

    analysis.value = object
  } catch (err) {
    analysisError.value = err instanceof Error ? err.message : 'Failed to analyze invoice.'
    analysis.value = null
  } finally {
    analyzing.value = false
  }
}
</script>