Add ID attributes to all content tags of an XHTML page

raw

add-ids.php

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env php
<?php
/**
 * Add an ID to each content tag in the given HTML file so
 * that it's possible to link to each paragraph.
 *
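 * The rules are:
 * - content tags get an ID (blockquote, dd, dl, dt, h1-h6, li, ol, p, pre, ul)
 *   unless they already have one
 * - existing IDs of headings are used as prefix for the following content tags
 * - if $bRemoveContainerIdsOnContent is enabled and a container tag (li)
 *   contains a content tag (p), the container tag (li) gets its id removed
 * - generated ids get prefixed with the ID of the preceding heading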
 *
 * @author Christian Weiske <cweiske@cweiske.de>
 */
// When true, a container tag that turns out to hold content tags has its
// ID removed again after its children were processed (see addIds()).
$bRemoveContainerIdsOnContent = false;

// --- command line handling ---------------------------------------------
if ($argc < 2) {
    echo "Please pass a file\n";
    exit(1);
}
$file = $argv[1];
if (!file_exists($file)) {
    echo "File does not exist: $file\n";
    exit(2);
}
if (substr($file, -4) !== '.htm') {
    echo "add-ids works only on htm files\n";
    exit(3);
}

// NOTE(review): nothing in this script visibly uses a class provided by the
// autoloader - confirm whether this include is still required.
require_once 'autoload.php';

// --- load the document -------------------------------------------------
$sx = simplexml_load_file($file);
if ($sx === false) {
    echo "Failed to load XML\n";
    exit(4);
}
if (!isset($sx->body)) {
    // Without a <body> there is nothing to walk; bail out before the file
    // gets re-serialized and overwritten for no reason.
    echo "No body element found in $file\n";
    exit(5);
}

// --- lookup tables -----------------------------------------------------
//tags whose ID is used as prefix for the next
$refTags = array_flip(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'));
// Tag name => short code used when building a generated ID
// ("<prefix><code><counter>"). Tags sharing a code share one counter.
$idTags = array(
    'blockquote' => 'q',
    'dd' => 'd',
    'dl' => 'l',
    'dt' => 't',
    'h1' => 'h',
    'h2' => 'h',
    'h3' => 'h',
    'h4' => 'h',
    'h5' => 'h',
    'h6' => 'h',
    'li' => 'i',
    'ol' => 'l',
    'p'  => 'p',
    'pre' => 'c',
    'ul' => 'l',
);
// Tags whose children are processed recursively.
$containerTags = array_flip(
    array(
        'blockquote', 'div',
        'dl', 'ol', 'ul', 'li'
    )
);

// --- mutable state shared with addIds() via "global" -------------------
$idPrefix    = '';      // prefix taken from the most recent heading ID
$counter     = array(); // short code => last number handed out
$existingIds = array(); // ID => true, for every ID known to be in use

// Register every ID that is already present anywhere in the document
// before generating new ones. addIds() only learns about an existing ID
// when it reaches that element, so without this pre-scan a generated ID
// could duplicate one that appears further down in the file.
$presentIds = $sx->xpath('//@id');
if (is_array($presentIds)) {
    foreach ($presentIds as $presentId) {
        $existingIds[(string) $presentId] = true;
    }
}

addIds($sx->body->children());
 
/**
 * Recursively add "id" attributes to the given sibling elements and,
 * for container tags, to their descendants.
 *
 * Behaviour, driven by the global lookup tables:
 * - Tags that are in neither $idTags nor $containerTags are skipped
 *   (their children are not visited).
 * - A tag that already has an ID keeps it; the ID is recorded in
 *   $existingIds. If the tag is a heading ($refTags), its ID plus "-"
 *   becomes the new $idPrefix and all counters except the heading counter
 *   are reset to 0.
 * - A tag from $idTags without an ID receives
 *   "<idPrefix><short code><counter>", skipping IDs already in use.
 * - If $bRemoveContainerIdsOnContent is true, a container that holds
 *   content tags loses the ID that was generated for it in this call.
 *   IDs that were already present in the document are never removed, so
 *   existing links keep working.
 *
 * @param SimpleXMLElement $tags Sibling elements to process, e.g. the
 *                               result of SimpleXMLElement::children()
 *
 * @return int Number of non-container tags listed in $idTags that were
 *             found in $tags or nested inside its containers
 */
function addIds($tags) {
    global $containerTags, $idTags, $refTags,
        $idPrefix, $counter, $existingIds, $bRemoveContainerIdsOnContent;

    $contentIds = 0;
    foreach ($tags as $tag) {
        $tagname     = $tag->getName();
        $isIdTag     = isset($idTags[$tagname]);
        $isContainer = isset($containerTags[$tagname]);
        if (!$isIdTag && !$isContainer) {
            continue;
        }

        //true only when this call created the ID of $tag
        $generated = false;

        if (isset($tag['id'])) {
            $existingIds[(string) $tag['id']] = true;
            if (isset($refTags[$tagname])) {
                $idPrefix = (string) $tag['id'] . '-';
                //reset counter, but keep heading counter
                foreach ($counter as $key => $val) {
                    if ($key !== 'h') {
                        $counter[$key] = 0;
                    }
                }
            }
        } elseif ($isIdTag) {
            //add ID to element
            $idtagname = $idTags[$tagname];
            if (!isset($counter[$idtagname])) {
                $counter[$idtagname] = 0;
            }

            //advance the counter until an unused ID is found
            do {
                $newId = $idPrefix . $idtagname . ++$counter[$idtagname];
            } while (isset($existingIds[$newId]));
            $tag['id'] = $newId;
            $existingIds[$newId] = true;
            $generated = true;
        }

        if ($isContainer) {
            $childContentIds = addIds($tag->children());
            $contentIds += $childContentIds;

            //only strip IDs we made up ourselves; an ID that was already
            //in the document may be a link target and must survive
            if ($bRemoveContainerIdsOnContent
                && $generated
                && $childContentIds > 0
            ) {
                unset($tag['id']);
            }
        } else {
            //not a container, so it must be a content tag from $idTags
            $contentIds++;
        }
    }
    return $contentIds;
}
 
// Serialize the modified tree back to a string.
$newXml = $sx->asXML();
// Sanity check before overwriting the input: a failed serialization or
// output that is shorter than the file on disk is treated as an error and
// the original file is left untouched.
if ($newXml === false || strlen($newXml) < filesize($file)) {
    echo "Something went wrong when adding IDs to $file\n";
    exit(10);
}
// Overwrite the input file in place; report a failed write instead of
// exiting successfully with nothing saved.
if (file_put_contents($file, $newXml) === false) {
    echo "Failed to write $file\n";
    exit(11);
}
?>
 
Christian Weiske Christian Weiske
owner

History