mirror of
https://github.com/CorentinTh/it-tools.git
synced 2025-05-05 13:57:10 -04:00
feat(new tool): OCR Image
OCR an image with Tesseract.js
This commit is contained in:
parent
a07806cd15
commit
0d17698bb0
8 changed files with 669 additions and 18 deletions
|
@ -1,7 +1,8 @@
|
|||
import { useRouteQuery } from '@vueuse/router';
|
||||
import { computed } from 'vue';
|
||||
import { useStorage } from '@vueuse/core';
|
||||
|
||||
export { useQueryParam };
|
||||
export { useQueryParam, useQueryParamOrStorage };
|
||||
|
||||
const transformers = {
|
||||
number: {
|
||||
|
@ -33,3 +34,31 @@ function useQueryParam<T>({ name, defaultValue }: { name: string; defaultValue:
|
|||
},
|
||||
});
|
||||
}
|
||||
|
||||
function useQueryParamOrStorage<T>({ name, storageName, defaultValue }: { name: string; storageName: string; defaultValue?: T }) {
|
||||
const type = typeof defaultValue;
|
||||
const transformer = transformers[type as keyof typeof transformers] ?? transformers.string;
|
||||
|
||||
const storageRef = useStorage(storageName, defaultValue);
|
||||
const storageDefaultValue = storageRef.value ?? defaultValue;
|
||||
|
||||
const proxy = useRouteQuery(name, transformer.toQuery(storageDefaultValue as never));
|
||||
|
||||
const ref = computed<T>({
|
||||
get() {
|
||||
return transformer.fromQuery(proxy.value) as unknown as T;
|
||||
},
|
||||
set(value) {
|
||||
proxy.value = transformer.toQuery(value as never);
|
||||
},
|
||||
});
|
||||
|
||||
watch(
|
||||
ref,
|
||||
(newValue) => {
|
||||
storageRef.value = newValue;
|
||||
},
|
||||
);
|
||||
|
||||
return ref;
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ import { tool as base64FileConverter } from './base64-file-converter';
|
|||
import { tool as base64StringConverter } from './base64-string-converter';
|
||||
import { tool as basicAuthGenerator } from './basic-auth-generator';
|
||||
import { tool as textToUnicode } from './text-to-unicode';
|
||||
import { tool as ocrImage } from './ocr-image';
|
||||
import { tool as pdfSignatureChecker } from './pdf-signature-checker';
|
||||
import { tool as numeronymGenerator } from './numeronym-generator';
|
||||
import { tool as macAddressGenerator } from './mac-address-generator';
|
||||
|
@ -127,7 +128,13 @@ export const toolsByCategory: ToolCategory[] = [
|
|||
},
|
||||
{
|
||||
name: 'Images and videos',
|
||||
components: [qrCodeGenerator, wifiQrCodeGenerator, svgPlaceholderGenerator, cameraRecorder],
|
||||
components: [
|
||||
qrCodeGenerator,
|
||||
wifiQrCodeGenerator,
|
||||
svgPlaceholderGenerator,
|
||||
cameraRecorder,
|
||||
ocrImage,
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'Development',
|
||||
|
|
12
src/tools/ocr-image/index.ts
Normal file
12
src/tools/ocr-image/index.ts
Normal file
|
@ -0,0 +1,12 @@
|
|||
import { Scan } from '@vicons/tabler';
|
||||
import { defineTool } from '../tool';
|
||||
|
||||
export const tool = defineTool({
|
||||
name: 'OCRize image and PDF',
|
||||
path: '/ocr-image',
|
||||
description: 'Perform OCR (Tesseract) on an image or PDF',
|
||||
keywords: ['ocr', 'image', 'tesseract', 'pdf'],
|
||||
component: () => import('./ocr-image.vue'),
|
||||
icon: Scan,
|
||||
createdAt: new Date('2024-03-09'),
|
||||
});
|
225
src/tools/ocr-image/ocr-image.vue
Normal file
225
src/tools/ocr-image/ocr-image.vue
Normal file
|
@ -0,0 +1,225 @@
|
|||
<script setup lang="ts">
|
||||
import type { Ref } from 'vue';
|
||||
import { createWorker } from 'tesseract.js';
|
||||
import { getDocument } from 'pdfjs-dist';
|
||||
import * as pdfJS from 'pdfjs-dist';
|
||||
import pdfJSWorkerURL from 'pdfjs-dist/build/pdf.worker?url';
|
||||
import TextareaCopyable from '@/components/TextareaCopyable.vue';
|
||||
import { useQueryParamOrStorage } from '@/composable/queryParams';
|
||||
|
||||
const languages = [
|
||||
{ name: 'English', code: 'eng' },
|
||||
{ name: 'Portuguese', code: 'por' },
|
||||
{ name: 'Afrikaans', code: 'afr' },
|
||||
{ name: 'Albanian', code: 'sqi' },
|
||||
{ name: 'Amharic', code: 'amh' },
|
||||
{ name: 'Arabic', code: 'ara' },
|
||||
{ name: 'Assamese', code: 'asm' },
|
||||
{ name: 'Azerbaijani', code: 'aze' },
|
||||
{ name: 'Azerbaijani - Cyrillic', code: 'aze_cyrl' },
|
||||
{ name: 'Basque', code: 'eus' },
|
||||
{ name: 'Belarusian', code: 'bel' },
|
||||
{ name: 'Bengali', code: 'ben' },
|
||||
{ name: 'Bosnian', code: 'bos' },
|
||||
{ name: 'Bulgarian', code: 'bul' },
|
||||
{ name: 'Burmese', code: 'mya' },
|
||||
{ name: 'Catalan; Valencian', code: 'cat' },
|
||||
{ name: 'Cebuano', code: 'ceb' },
|
||||
{ name: 'Central Khmer', code: 'khm' },
|
||||
{ name: 'Cherokee', code: 'chr' },
|
||||
{ name: 'Chinese - Simplified', code: 'chi_sim' },
|
||||
{ name: 'Chinese - Traditional', code: 'chi_tra' },
|
||||
{ name: 'Croatian', code: 'hrv' },
|
||||
{ name: 'Czech', code: 'ces' },
|
||||
{ name: 'Danish', code: 'dan' },
|
||||
{ name: 'Dutch; Flemish', code: 'nld' },
|
||||
{ name: 'Dzongkha', code: 'dzo' },
|
||||
{ name: 'English, Middle (1100-1500)', code: 'enm' },
|
||||
{ name: 'Esperanto', code: 'epo' },
|
||||
{ name: 'Estonian', code: 'est' },
|
||||
{ name: 'Finnish', code: 'fin' },
|
||||
{ name: 'French', code: 'fra' },
|
||||
{ name: 'French, Middle (ca. 1400-1600)', code: 'frm' },
|
||||
{ name: 'Galician', code: 'glg' },
|
||||
{ name: 'Georgian', code: 'kat' },
|
||||
{ name: 'German', code: 'deu' },
|
||||
{ name: 'German Fraktur', code: 'frk' },
|
||||
{ name: 'Greek, Modern (1453-)', code: 'ell' },
|
||||
{ name: 'Greek, Ancient (-1453)', code: 'grc' },
|
||||
{ name: 'Gujarati', code: 'guj' },
|
||||
{ name: 'Haitian; Haitian Creole', code: 'hat' },
|
||||
{ name: 'Hebrew', code: 'heb' },
|
||||
{ name: 'Hindi', code: 'hin' },
|
||||
{ name: 'Hungarian', code: 'hun' },
|
||||
{ name: 'Icelandic', code: 'isl' },
|
||||
{ name: 'Indonesian', code: 'ind' },
|
||||
{ name: 'Inuktitut', code: 'iku' },
|
||||
{ name: 'Irish', code: 'gle' },
|
||||
{ name: 'Italian', code: 'ita' },
|
||||
{ name: 'Japanese', code: 'jpn' },
|
||||
{ name: 'Javanese', code: 'jav' },
|
||||
{ name: 'Kannada', code: 'kan' },
|
||||
{ name: 'Kazakh', code: 'kaz' },
|
||||
{ name: 'Kirghiz; Kyrgyz', code: 'kir' },
|
||||
{ name: 'Korean', code: 'kor' },
|
||||
{ name: 'Kurdish', code: 'kur' },
|
||||
{ name: 'Lao', code: 'lao' },
|
||||
{ name: 'Latin', code: 'lat' },
|
||||
{ name: 'Latvian', code: 'lav' },
|
||||
{ name: 'Lithuanian', code: 'lit' },
|
||||
{ name: 'Macedonian', code: 'mkd' },
|
||||
{ name: 'Malay', code: 'msa' },
|
||||
{ name: 'Malayalam', code: 'mal' },
|
||||
{ name: 'Maltese', code: 'mlt' },
|
||||
{ name: 'Marathi', code: 'mar' },
|
||||
{ name: 'Nepali', code: 'nep' },
|
||||
{ name: 'Norwegian', code: 'nor' },
|
||||
{ name: 'Oriya', code: 'ori' },
|
||||
{ name: 'Panjabi; Punjabi', code: 'pan' },
|
||||
{ name: 'Persian', code: 'fas' },
|
||||
{ name: 'Polish', code: 'pol' },
|
||||
{ name: 'Pushto; Pashto', code: 'pus' },
|
||||
{ name: 'Romanian; Moldavian; Moldovan', code: 'ron' },
|
||||
{ name: 'Russian', code: 'rus' },
|
||||
{ name: 'Sanskrit', code: 'san' },
|
||||
{ name: 'Serbian', code: 'srp' },
|
||||
{ name: 'Serbian - Latin', code: 'srp_latn' },
|
||||
{ name: 'Sinhala; Sinhalese', code: 'sin' },
|
||||
{ name: 'Slovak', code: 'slk' },
|
||||
{ name: 'Slovenian', code: 'slv' },
|
||||
{ name: 'Spanish; Castilian', code: 'spa' },
|
||||
{ name: 'Swahili', code: 'swa' },
|
||||
{ name: 'Swedish', code: 'swe' },
|
||||
{ name: 'Syriac', code: 'syr' },
|
||||
{ name: 'Tagalog', code: 'tgl' },
|
||||
{ name: 'Tajik', code: 'tgk' },
|
||||
{ name: 'Tamil', code: 'tam' },
|
||||
{ name: 'Telugu', code: 'tel' },
|
||||
{ name: 'Thai', code: 'tha' },
|
||||
{ name: 'Tibetan', code: 'bod' },
|
||||
{ name: 'Tigrinya', code: 'tir' },
|
||||
{ name: 'Turkish', code: 'tur' },
|
||||
{ name: 'Uighur; Uyghur', code: 'uig' },
|
||||
{ name: 'Ukrainian', code: 'ukr' },
|
||||
{ name: 'Urdu', code: 'urd' },
|
||||
{ name: 'Uzbek', code: 'uzb' },
|
||||
{ name: 'Uzbek - Cyrillic', code: 'uzb_cyrl' },
|
||||
{ name: 'Vietnamese', code: 'vie' },
|
||||
{ name: 'Welsh', code: 'cym' },
|
||||
{ name: 'Yiddish', code: 'yid' },
|
||||
];
|
||||
const languagesOptions = Array.from(languages.map(l => ({
|
||||
label: l.name,
|
||||
value: l.code,
|
||||
})));
|
||||
|
||||
const language = useQueryParamOrStorage({ name: 'lang', storageName: 'ocr-image:lang', defaultValue: 'eng' });
|
||||
|
||||
const ocrInProgress = ref(false);
|
||||
const fileInput = ref() as Ref<File>;
|
||||
const ocrText = computedAsync(async () => {
|
||||
try {
|
||||
return (await ocr(fileInput.value, language.value));
|
||||
}
|
||||
catch (e: any) {
|
||||
return e.toString();
|
||||
}
|
||||
});
|
||||
|
||||
async function onUpload(file: File) {
|
||||
if (file) {
|
||||
fileInput.value = file;
|
||||
}
|
||||
}
|
||||
|
||||
async function loadPdfFile(file: File) {
|
||||
const arrBuffer = await file.arrayBuffer();
|
||||
const byteArray = new Uint8Array(arrBuffer);
|
||||
return getDocument({ data: byteArray }).promise;
|
||||
}
|
||||
|
||||
async function convertPdfToImage(file: File) {
|
||||
const pdf = await loadPdfFile(file);
|
||||
const container = document.getElementById('container');
|
||||
const images = [];
|
||||
for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
|
||||
const page = await pdf.getPage(pageNumber);
|
||||
const viewport = page.getViewport({ scale: 1.5 });
|
||||
const canvas = document.createElement('canvas');
|
||||
canvas.height = viewport.height;
|
||||
canvas.width = viewport.width;
|
||||
container?.appendChild(canvas);
|
||||
await page.render({
|
||||
canvasContext: canvas.getContext('2d') as CanvasRenderingContext2D,
|
||||
viewport,
|
||||
}).promise;
|
||||
images.push(canvas.toDataURL('image/png'));
|
||||
}
|
||||
return images;
|
||||
};
|
||||
|
||||
async function ocr(file: File, language: string) {
|
||||
if (!file) {
|
||||
return '';
|
||||
}
|
||||
ocrInProgress.value = true;
|
||||
const worker = await createWorker();
|
||||
await worker.reinitialize(language);
|
||||
const allTexts = [];
|
||||
if (file.type.match('^image/')) {
|
||||
const ret = await worker.recognize(file);
|
||||
allTexts.push(ret.data.text);
|
||||
}
|
||||
else {
|
||||
pdfJS.GlobalWorkerOptions.workerSrc = pdfJSWorkerURL;
|
||||
|
||||
for (const image of (await convertPdfToImage(file))) {
|
||||
const ret = await worker.recognize(image);
|
||||
allTexts.push(ret.data.text);
|
||||
}
|
||||
}
|
||||
await worker.terminate();
|
||||
ocrInProgress.value = false;
|
||||
return allTexts.join('\n=============\n');
|
||||
};
|
||||
</script>
|
||||
|
||||
<template>
|
||||
<div style="max-width: 600px;">
|
||||
<c-select
|
||||
v-model:value="language"
|
||||
label="Language"
|
||||
:options="languagesOptions"
|
||||
searchable mb-2
|
||||
/>
|
||||
|
||||
<c-file-upload
|
||||
title="Drag and drop a Image or PDF here, or click to select a file"
|
||||
:paste-image="true"
|
||||
@file-upload="onUpload"
|
||||
/>
|
||||
|
||||
<n-divider />
|
||||
|
||||
<div id="container" style="display: none;" />
|
||||
|
||||
<div>
|
||||
<h3>OCR</h3>
|
||||
<TextareaCopyable
|
||||
v-if="!ocrInProgress"
|
||||
v-model:value="ocrText"
|
||||
:word-wrap="true"
|
||||
/>
|
||||
<n-spin
|
||||
v-if="ocrInProgress"
|
||||
size="small"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<style lang="less" scoped>
|
||||
::v-deep(.n-upload-trigger) {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
Loading…
Add table
Add a link
Reference in a new issue