HTML base64 image extractor

raw

htmlbase64extractor.php

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env php
<?php
/**
 * Extract base64 encoded images from an HTML file.
 * Saves the images as files and replaces the base64 images with
 * the file paths.
 *
 * Useful for LibreOffice/OpenOffice HTML exports.
 *
 * Usage:
 * $ ./htmlbase64extractor.php file.html > fixed.html
 *
 * Licensed in the public domain
 *
 * @author Christian Weiske <weiske@mogic.com>
 */
// Diagnostics are written to STDERR throughout: STDOUT carries the rewritten
// HTML and is normally redirected into a file (see "Usage" above).
if ($argc < 2) {
    fwrite(STDERR, "File missing\n");
    exit(1);
}

// MIME type => file extension used for the extracted image files.
$map = array(
    'image/jpg'  => 'jpg',
    'image/jpeg' => 'jpg',
    'image/png'  => 'png',
    'image/gif'  => 'gif',
);

$file = $argv[1];
// Directory the images are written to: "<input file>-images/".
$imgdir    = $file . '-images/';
// The same directory expressed relative to the input file's own directory;
// this is the prefix that ends up in the rewritten HTML.
$relimgdir = basename($file . '-images/') . '/';

// Read the input before touching the file system, so that an unreadable
// input does not leave an empty image directory behind.
$content = false;
if (is_readable($file) && !is_dir($file)) {
    $content = file_get_contents($file);
}
if ($content === false) {
    fwrite(STDERR, 'Cannot read file: ' . $file . "\n");
    exit(1);
}

// The second is_dir() check covers the case where the directory appeared
// between the first check and the mkdir() call.
if (!is_dir($imgdir) && !mkdir($imgdir) && !is_dir($imgdir)) {
    fwrite(STDERR, 'Cannot create image directory: ' . $imgdir . "\n");
    exit(1);
}

// Running number used by extractImage() to name the files.
$imgcounter = 0;
$content2 = preg_replace_callback(
    '#data:(image/[^;]*);base64,([a-zA-Z0-9+/=]*)#',
    'extractImage',
    $content
);

// preg_replace_callback() returns null when PCRE fails (e.g. a limit is hit);
// echoing that would silently produce an empty document.
if ($content2 === null) {
    fwrite(STDERR, 'Replacing images failed, PCRE error code ' . preg_last_error() . "\n");
    exit(1);
}

echo $content2;
 
 
/**
 * Callback for preg_replace_callback(): stores one base64 data URI as an
 * image file and returns the path that replaces the data URI in the HTML.
 *
 * Reads the globals $imgdir (directory the file is written to), $relimgdir
 * (prefix of the returned path) and $map (MIME type => file extension), and
 * increments the global $imgcounter that numbers the files.
 *
 * @param array $matches Regex match: [0] whole data URI, [1] MIME type,
 *                       [2] base64 payload
 *
 * @return string $relimgdir followed by the name of the written file
 */
function extractImage($matches)
{
    global $imgcounter, $imgdir, $relimgdir, $map;

    list(, $type, $base64) = $matches;

    $data = base64_decode($base64);
    if ($data === false) {
        // Undecodable payload: an empty file is written, as before.
        $data = '';
    }

    // MIME types are case-insensitive, the keys of $map are lower case.
    $type = strtolower($type);
    if ($type === 'image/*') {
        // Generic type: identify the format from the image data itself.
        $type = extractImageSniffType($data);
    }

    if (isset($map[$type])) {
        $extension = $map[$type];
    } else {
        // Type without a $map entry (e.g. image/webp, image/svg+xml):
        // derive an extension instead of reading an undefined array key.
        $extension = extractImageExtension($type);
    }

    $filename = 'image-' . $imgcounter++ . '.' . $extension;
    file_put_contents($imgdir . $filename, $data);
    return $relimgdir . $filename;
}

/**
 * Helper of extractImage(): guesses the MIME type of raw image data from
 * its leading magic bytes.
 *
 * @param string $data Decoded image data
 *
 * @return string "image/jpeg" or "image/gif" when the signature matches,
 *                "image/png" otherwise (also the default for unknown data)
 */
function extractImageSniffType($data)
{
    if (strncmp($data, "\xFF\xD8\xFF", 3) === 0) {
        return 'image/jpeg';
    }
    if (strncmp($data, 'GIF87a', 6) === 0 || strncmp($data, 'GIF89a', 6) === 0) {
        return 'image/gif';
    }
    return 'image/png';
}

/**
 * Helper of extractImage(): derives a file extension from the subtype of a
 * MIME type, e.g. "image/webp" => "webp", "image/svg+xml" => "svg".
 *
 * @param string $type Lower-case MIME type
 *
 * @return string Extension made of a-z and 0-9 only, "bin" if none is left
 */
function extractImageExtension($type)
{
    $slash   = strpos($type, '/');
    $subtype = $slash === false ? $type : substr($type, $slash + 1);

    // Drop a structured-syntax suffix ("+xml") and an experimental "x-" prefix.
    $subtype = preg_replace('/\+.*$/', '', $subtype);
    $subtype = preg_replace('/^x-/', '', $subtype);

    // Only plain characters may reach the file name; this also keeps
    // path separators and dots from the HTML out of it.
    $extension = preg_replace('/[^a-z0-9]/', '', $subtype);
    return $extension === '' ? 'bin' : $extension;
}
?>
 
Christian Weiske Christian Weiske
owner

History