% \iffalse meta-comment
%
%% File: tagpdf-tree.dtx
%
% Copyright (C) 2019-2024 Ulrike Fischer
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version. The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "tagpdf bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/tagpdf
%
% for those people who are interested.
%
%<*driver>
\DocumentMetadata{}
\documentclass{l3doc}
\usepackage{array,booktabs,caption}
\hypersetup{pdfauthor=Ulrike Fischer,
 pdftitle=tagpdf-tree module (tagpdf)}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
% \title{^^A
%   The \pkg{tagpdf-tree} module\\ Commands trees and main dictionaries ^^A
%   \\ Part of the tagpdf package
% }
%
% \author{^^A
%  Ulrike Fischer\thanks
%    {^^A
%      E-mail:
%        \href{mailto:fischer@troubleshooting-tex.de}
%          {fischer@troubleshooting-tex.de}^^A
%    }^^A
% }
%
% \date{Version 0.99i, released 2024-11-19}
% \maketitle
% \begin{implementation}
% The header guard below contains only the package identification; the
% |@@| prefix of this module is |tag|.
%    \begin{macrocode}
%<@@=tag>
%<*header>
\ProvidesExplPackage {tagpdf-tree-code} {2024-11-19} {0.99i}
 {part of tagpdf - code related to writing trees and dictionaries to the pdf}
%</header>
%    \end{macrocode}
% \section{Trees, pdfmanagement and finalization code}
%
% The code to finish the structure is in a hook.
% This will perhaps at the end be a kernel hook.
% TODO check right place for the code
% The pdfmanagement code is in the kernel hook after
% |shipout/lastpage|, so all code affecting it should be executed before.
% Objects can be written later, at least in pdf mode.
% The finishing code is only registered if the tree is active. If the
% output mode is PDF it is added to the |enddocument/end| hook, otherwise
% to the |shipout/lastpage| hook.
%    \begin{macrocode}
%<*package>
\hook_gput_code:nnn{begindocument}{tagpdf}
  {
    \bool_if:NT \g_@@_active_tree_bool
      {
        \sys_if_output_pdf:TF
          {
            \AddToHook{enddocument/end} { \@@_finish_structure: }
          }
          {
            \AddToHook{shipout/lastpage} { \@@_finish_structure: }
          }
      }
  }
%    \end{macrocode}
%
% \subsection{Check structure}
% \begin{macro}{\@@_tree_final_checks:}
% This checks the structure stack at the end of the document.
% If \cs{g_@@_struct_stack_seq} does not hold exactly one item, the warning
% |tree-struct-still-open| is issued and \cs{tag_struct_end:} is called once
% for every item beyond the first. The count is evaluated once, before the
% loop starts. The note |tree-statistic| is issued in every case.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_final_checks:
  {
    \int_compare:nNnF {\seq_count:N\g_@@_struct_stack_seq}={1}
      {
        \msg_warning:nn {tag}{tree-struct-still-open}
        \int_step_inline:nnn{2}{\seq_count:N\g_@@_struct_stack_seq}
          {\tag_struct_end:}
      }
    \msg_note:nn {tag}{tree-statistic}
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Catalog: MarkInfo and StructTreeRoot and OpenAction}
% The StructTreeRoot and the MarkInfo entry must be added to the catalog.
% If there is an OpenAction entry we must update it,
% so that it also contains a structure destination.
% We do it late so that we can win, but before the pdfmanagement hook.
% \begin{macro}{@@/struct/1}
% This is the object for the root object, the StructTreeRoot.
%    \begin{macrocode}
\pdf_object_new_indexed:nn { @@/struct }{ 1 }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\g_@@_tree_openaction_struct_tl}
% We need a variable that indicates which structure is wanted in the
% OpenAction. By default we use 2 (the Document structure).
%    \begin{macrocode}
\tl_new:N \g_@@_tree_openaction_struct_tl
\tl_gset:Nn \g_@@_tree_openaction_struct_tl { 2 }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{viewer/startstructure (setup-key)}
% We also need an option to set up the start structure. So we set up a key
% which sets the variable to the current structure. This still requires
% hyperref to do most of the job, this should perhaps be changed.
% The value given to the key is expanded when it is stored. If the key is
% used without a value, the current value of the structure counter is stored.
%    \begin{macrocode}
\keys_define:nn { @@ / setup }
  {
    viewer/startstructure .code:n =
      {
        \tl_gset:Ne \g_@@_tree_openaction_struct_tl {#1}
      }
    ,viewer/startstructure .default:n = { \int_use:N \c@g_@@_struct_abs_int }
  }
%    \end{macrocode}
% \end{macro}
% The OpenAction should only be updated if it is there. So we inspect the
% Catalog-prop. If an entry is found, the old value is reused as the |/D|
% destination of a GoTo action and a structure destination |/SD| is added
% which points to the structure stored in
% \cs{g_@@_tree_openaction_struct_tl}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_update_openaction:
  {
    \prop_get:cnNT
      { \__kernel_pdfdict_name:n { g__pdf_Core/Catalog } }
      {OpenAction}
      \l_@@_tmpa_tl
      {
%    \end{macrocode}
% We only do something if the OpenAction is an array (as set by hyperref);
% in other cases we hope that the author knows what they did.
%    \begin{macrocode}
        \tl_if_head_eq_charcode:eNT { \tl_trim_spaces:V\l_@@_tmpa_tl } [ %]
          {
            \seq_set_split:NnV\l_@@_tmpa_seq{/}\l_@@_tmpa_tl
            \pdfmanagement_add:nne {Catalog} { OpenAction }
              {
                <<
                  /S/GoTo \c_space_tl
                  /D~\l_@@_tmpa_tl\c_space_tl
                  /SD~[\pdf_object_ref_indexed:nn{@@/struct}{\g_@@_tree_openaction_struct_tl}
%    \end{macrocode}
% There should always be a /Fit etc in the array but better play safe here.
% The old array has been split at the slashes. If there is a second item it
% is appended after a slash, and no closing bracket is added in this branch,
% so the item is expected to end with the closing bracket of the old array.
% Otherwise the array is closed directly.
%    \begin{macrocode}
                   \int_compare:nNnTF{ \seq_count:N \l_@@_tmpa_seq } > {1}
                    { /\seq_item:Nn\l_@@_tmpa_seq{2} }
                    { ] }
                >>
              }
          }
      }
  }
%    \end{macrocode}
%
% In the |shipout/lastpage| hook the |Marked| entry of MarkInfo and the
% reference to the StructTreeRoot object are added to the catalog and the
% OpenAction is updated. This is only done if the tree is active.
%    \begin{macrocode}
\hook_gput_code:nnn{shipout/lastpage}{tagpdf}
  {
    \bool_if:NT \g_@@_active_tree_bool
      {
        \pdfmanagement_add:nnn { Catalog / MarkInfo } { Marked } { true }
        \pdfmanagement_add:nne
          { Catalog }
          { StructTreeRoot }
          { \pdf_object_ref_indexed:nn { @@/struct } { 1 } }
        \@@_tree_update_openaction:
      }
  }
%    \end{macrocode}
%
% \subsection{Writing the IDtree}
%
% The IDs are currently quite simple: every structure has an ID built from
% the prefix ID together with the structure number padded with enough zeros
% so that we directly get a lexical order.
% We ship them out in bundles.
% At first an integer variable to hold the padding length of the IDs.
% \begin{variable}{\g_@@_tree_id_pad_int}
%    \begin{macrocode}
\int_new:N\g_@@_tree_id_pad_int
%    \end{macrocode}
% \end{variable}
% Now we get the needed padding: the number of tokens in the |tagstruct|
% value of the last page (with 1000 as fallback value) plus one.
%    \begin{macrocode}
\cs_generate_variant:Nn \tl_count:n {e}
\hook_gput_code:nnn{begindocument}{tagpdf}
  {
    \int_gset:Nn\g_@@_tree_id_pad_int
      {\tl_count:e { \@@_property_ref_lastpage:nn{tagstruct}{1000}}+1}
  }
%    \end{macrocode}
% This is the main code to write the tree. It basically splits the
% existing structure numbers in chunks of length 50.
% The loop starts at structure 2. For every structure the ID and the object
% reference are collected in \cs{l_@@_tmpa_tl}. After 50 structures
% an unnamed dictionary object with the |/Limits| and the |/Names| is
% written and its reference is added to \cs{l_@@_tmpb_tl}. A remaining
% incomplete chunk is written after the loop. At the end an object with the
% |/Kids| is written and its reference is stored as |IDTree| entry in the
% prop of structure 1.
% TODO consider if 50 is a good length.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_idtree:
  {
    \tl_clear:N \l_@@_tmpa_tl
    \tl_clear:N \l_@@_tmpb_tl
    \int_zero:N \l_@@_tmpa_int
    \int_step_inline:nnn {2} {\c@g_@@_struct_abs_int}
      {
        \int_incr:N\l_@@_tmpa_int
        \tl_put_right:Ne \l_@@_tmpa_tl
          {
            \@@_struct_get_id:n{##1}~\pdf_object_ref_indexed:nn {@@/struct}{##1}~
          }
        \int_compare:nNnF {\l_@@_tmpa_int}<{50} %
          {
            \pdf_object_unnamed_write:ne {dict}
              { /Limits~[\@@_struct_get_id:n{##1-\l_@@_tmpa_int+1}~\@@_struct_get_id:n{##1}]
                /Names~[\l_@@_tmpa_tl]
              }
            \tl_put_right:Ne\l_@@_tmpb_tl {\pdf_object_ref_last:\c_space_tl}
            \int_zero:N \l_@@_tmpa_int
            \tl_clear:N \l_@@_tmpa_tl
          }
      }
    \tl_if_empty:NF \l_@@_tmpa_tl
      {
        \pdf_object_unnamed_write:ne {dict}
         {
           /Limits~
             [\@@_struct_get_id:n{\c@g_@@_struct_abs_int-\l_@@_tmpa_int+1}~
              \@@_struct_get_id:n{\c@g_@@_struct_abs_int}]
           /Names~[\l_@@_tmpa_tl]
         }
        \tl_put_right:Ne\l_@@_tmpb_tl {\pdf_object_ref_last:}
      }
    \pdf_object_unnamed_write:ne {dict}{/Kids~[\l_@@_tmpb_tl]}
    \@@_prop_gput:cne
      { g_@@_struct_1_prop }
      { IDTree }
      { \pdf_object_ref_last: }
  }
%    \end{macrocode}
%
% \subsection{Writing structure elements}
% The following commands are needed to write out the structure.
% \begin{macro}{\@@_tree_write_structtreeroot:}
% This writes out the root object.
% At first the references to the ParentTree and the RoleMap objects are
% stored in the prop of structure 1 and the kid key is filled. The |S| key
% is removed before the dictionary content is retrieved and written.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_structtreeroot:
  {
    \@@_prop_gput:cne
      { g_@@_struct_1_prop }
      { ParentTree }
      { \pdf_object_ref:n { @@/tree/parenttree } }
    \@@_prop_gput:cne
      { g_@@_struct_1_prop }
      { RoleMap }
      { \pdf_object_ref:n { @@/tree/rolemap } }
    \@@_struct_fill_kid_key:n { 1 }
    \prop_gremove:cn { g_@@_struct_1_prop } {S}
    \@@_struct_get_dict_content:nN { 1 } \l_@@_tmpa_tl
    \pdf_object_write_indexed:nnne
      { @@/struct } { 1 }
      {dict}
      {
        \l_@@_tmpa_tl
      }
%    \end{macrocode}
% Better put S back, see https://github.com/latex3/tagging-project/issues/86
%    \begin{macrocode}
    \prop_gput:cnn { g_@@_struct_1_prop } {S}{ /StructTreeRoot }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_tree_write_structelements:}
% This writes out the other struct elems, the absolute number is in the
% counter. The loop starts at 2 as structure 1 is the root, which is
% written by \cs{@@_tree_write_structtreeroot:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_structelements:
  {
    \int_step_inline:nnnn {2}{1}{\c@g_@@_struct_abs_int}
      {
        \@@_struct_write_obj:n { ##1 }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{ParentTree}
% \begin{macro}{@@/tree/parenttree}
% The object which will hold the parenttree.
%    \begin{macrocode}
\pdf_object_new:n { @@/tree/parenttree }
%    \end{macrocode}
% \end{macro}
% The ParentTree maps numbers to objects or (if the number represents a page)
% to arrays of objects. The numbers refer to two distinct types of entries:
% page streams and real objects like annotations.
% The numbers must be distinct and ordered.
% So we rely on abspage for the pages and put the real objects at the end.
% We use a counter to have a chance to get the correct number
% if code is processed twice.
%
% \begin{macro}{\c@g_@@_parenttree_obj_int}
% This is a counter for the real objects. It starts at the absolute last page
% value. It relies on l3ref.
% If the |abspage| value of the last page is not known, 100 is used.
%    \begin{macrocode}
\newcounter  { g_@@_parenttree_obj_int }
\hook_gput_code:nnn{begindocument}{tagpdf}
  {
    \int_gset:Nn
      \c@g_@@_parenttree_obj_int
      { \@@_property_ref_lastpage:nn{abspage}{100}  }
  }
%    \end{macrocode}
% \end{macro}
% We store the number/object references in a tl-var. If more structure is
% needed one could switch to a seq.
% \begin{variable}{ \g_@@_parenttree_objr_tl }
%    \begin{macrocode}
\tl_new:N \g_@@_parenttree_objr_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{ \@@_parenttree_add_objr:nn }
% This command stores a StructParent number and an objref into the tl var.
% Both arguments are expanded and every pair is followed by a |^^J|.
% This is only for objects like annotations, pages are handled elsewhere.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_parenttree_add_objr:nn #1 #2 %#1 StructParent number, #2 objref
  {
    \tl_gput_right:Ne \g_@@_parenttree_objr_tl
      {
        #1 \c_space_tl #2 ^^J
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\l_@@_parenttree_content_tl}
% A tl-var which will get the page related parenttree content.
%    \begin{macrocode}
\tl_new:N \l_@@_parenttree_content_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_tree_fill_parenttree:}
% This is the main command to assemble the page related entries of the
% parent tree. It wanders through the pages and the mcid numbers and
% collects all mcids of one page in a prop which maps the mcid on the page
% to the parent stored in \cs{g_@@_mc_parenttree_prop}. Then the array for
% the page is built: the key is the page number minus one, the entries are
% retrieved in order from mcid 0 up to the number of collected mcids minus
% one. If the prop of the parent structure does not exist, |null| is
% written. If an index is missing in the prop,
% \cs{@@_tree_parenttree_rerun_msg:} (which does nothing by default) is
% redefined to issue the warning |tree-mcid-index-wrong|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_parenttree_rerun_msg: {}
\cs_new_protected:Npn \@@_tree_fill_parenttree:
  {
    \int_step_inline:nnnn{1}{1}{\@@_property_ref_lastpage:nn{abspage}{-1}} %not quite clear if labels are needed. See lua code
      { %page ##1
        \prop_clear:N \l_@@_tmpa_prop
        \int_step_inline:nnnn{1}{1}{\@@_property_ref_lastpage:nn{tagmcabs}{-1}}
          {
            %mcid####1
            \int_compare:nT
              {\property_ref:enn{mcid-####1}{tagabspage}{-1}=##1} %mcid is on current page
              {% yes
                \prop_put:Nee
                  \l_@@_tmpa_prop
                  {\property_ref:enn{mcid-####1}{tagmcid}{-1}}
                  {\prop_item:Nn \g_@@_mc_parenttree_prop {####1}}
              }
          }
        \tl_put_right:Ne\l_@@_parenttree_content_tl
          {
            \int_eval:n {##1-1}\c_space_tl
            [\c_space_tl %]
          }
        \int_step_inline:nnnn %####1
          {0}
          {1}
          { \prop_count:N \l_@@_tmpa_prop -1 }
          {
            \prop_get:NnNTF \l_@@_tmpa_prop {####1} \l_@@_tmpa_tl
              {% page#1:mcid##1:\l_@@_tmpa_tl :content
                \tl_put_right:Ne \l_@@_parenttree_content_tl
                  {
                    \prop_if_exist:cTF { g_@@_struct_ \l_@@_tmpa_tl _prop }
                      {
                        \pdf_object_ref_indexed:nn { @@/struct }{ \l_@@_tmpa_tl }
                      }
                      {
                        null
                      }
                    \c_space_tl
                  }
              }
              {
                \cs_set_protected:Npn \@@_tree_parenttree_rerun_msg:
                  {
                    \msg_warning:nn { tag } {tree-mcid-index-wrong}
                  }
              }
          }
        \tl_put_right:Nn
          \l_@@_parenttree_content_tl
          {%[
            ]^^J
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_tree_lua_fill_parenttree:}
% This is a special variant for luatex.
% lua mode must/can do it differently.
% Note that the Lua call is stored unexpanded in the tl-var, so the Lua
% function is only executed when the variable is expanded later.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_lua_fill_parenttree:
  {
    \tl_set:Nn \l_@@_parenttree_content_tl
      {
        \lua_now:e
          {
            ltx.@@.func.output_parenttree
              (
                \int_use:N\g_shipout_readonly_int
              )
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_tree_write_parenttree:}
% This combines the two parts and writes out the object.
% TODO should the check for lua be moved into the backend code?
% Depending on the mode the page related content is filled by the lua or
% by the generic variant. Then the rerun message is executed, the entries
% for the real objects are appended and the object is written with the
% content as |/Nums| array.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_parenttree:
  {
    \bool_if:NTF \g_@@_mode_lua_bool
      {
        \@@_tree_lua_fill_parenttree:
      }
      {
        \@@_tree_fill_parenttree:
      }
    \@@_tree_parenttree_rerun_msg:
    \tl_put_right:NV \l_@@_parenttree_content_tl\g_@@_parenttree_objr_tl
    \pdf_object_write:nne { @@/tree/parenttree }{dict}
      {
        /Nums\c_space_tl [\l_@@_parenttree_content_tl]
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Rolemap dictionary}
% The Rolemap dictionary describes relations between new tags and standard
% types. The main part here is handled in the role module, here we only
% define the command which writes it to the PDF.
% \begin{variable}{@@/tree/rolemap}
% At first we reserve again an object.
% Rolemap is also used in PDF 2.0 as a fallback.
%    \begin{macrocode}
\pdf_object_new:n  { @@/tree/rolemap }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_tree_write_rolemap:}
% This writes out the rolemap, basically it simply pushes out
% the dictionary which has been filled in the role module.
% If \cs{g_@@_role_add_mathml_bool} is true, every key of the mathml
% namespace prop is first mapped to |Span| in the rolemap prop.
% Entries whose key and value are equal are not added to the dictionary.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_rolemap:
  {
    \bool_if:NT \g_@@_role_add_mathml_bool
      {
        \prop_map_inline:Nn \g_@@_role_NS_mathml_prop
          {
            \prop_gput:Nnn \g_@@_role_rolemap_prop {##1}{Span}
          }
      }
    \prop_map_inline:Nn\g_@@_role_rolemap_prop
      {
        \tl_if_eq:nnF {##1}{##2}
          {
            \pdfdict_gput:nne {g_@@_role/RoleMap_dict}
              {##1}
              {\pdf_name_from_unicode_e:n{##2}}
          }
      }
    \pdf_object_write:nne { @@/tree/rolemap }{dict}
      {
        \pdfdict_use:n{g_@@_role/RoleMap_dict}
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Classmap dictionary}
% Classmap and attributes are set up in the struct module, here is only the
% code to write it out. It should only be done if values have been used:
% the object is only created and written, and the |ClassMap| entry is only
% added to the prop of structure 1, if the collected content is not empty.
% \begin{macro}{\@@_tree_write_classmap:}
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_classmap:
  {
    \tl_clear:N \l_@@_tmpa_tl
%    \end{macrocode}
% We process the older seq for compatibility with the table code.
% TODO: check if still needed
%    \begin{macrocode}
    \seq_map_inline:Nn \g_@@_attr_class_used_seq
      {
        \prop_gput:Nnn \g_@@_attr_class_used_prop {##1}{}
      }
    \prop_map_inline:Nn \g_@@_attr_class_used_prop
      {
        \tl_put_right:Ne \l_@@_tmpa_tl
          {
            ##1\c_space_tl
            <<
              \prop_item:Nn
                \g_@@_attr_entries_prop
                {##1}
            >>
            \iow_newline:
          }
      }
    \tl_if_empty:NF
      \l_@@_tmpa_tl
      {
        \pdf_object_new:n  { @@/tree/classmap }
        \pdf_object_write:nne
          { @@/tree/classmap }
          {dict}
          { \l_@@_tmpa_tl }
        \@@_prop_gput:cne
          { g_@@_struct_1_prop }
          { ClassMap }
          { \pdf_object_ref:n { @@/tree/classmap }  }
      }
  }
%    \end{macrocode}
% \end{macro}
% \subsection{Namespaces}
% Namespaces are handled in the role module, here is the code to write them
% out. Namespaces are only relevant for pdf2.0.
% \begin{variable}{@@/tree/namespaces}
%    \begin{macrocode}
\pdf_object_new:n  { @@/tree/namespaces }
%    \end{macrocode}
% \end{variable}
% \begin{macro}{\@@_tree_write_namespaces:}
% Nothing is written if the PDF version is smaller than 2.0. Otherwise
% for every namespace the RoleMapNS dictionary is written if it is not
% empty and its reference is added to the namespace dictionary, then the
% namespace object itself is written. At the end the array object
% with the values of the namespace prop is written.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_tree_write_namespaces:
  {
    \pdf_version_compare:NnF < {2.0}
      {
        \prop_map_inline:Nn \g_@@_role_NS_prop
          {
            \pdfdict_if_empty:nF {g_@@_role/RoleMapNS_##1_dict}
              {
                \pdf_object_write:nne {@@/RoleMapNS/##1}{dict}
                  {
                    \pdfdict_use:n {g_@@_role/RoleMapNS_##1_dict}
                  }
                \pdfdict_gput:nne{g_@@_role/Namespace_##1_dict}
                  {RoleMapNS}{\pdf_object_ref:n {@@/RoleMapNS/##1}}
              }
            \pdf_object_write:nne{tag/NS/##1}{dict}
              {
                \pdfdict_use:n {g_@@_role/Namespace_##1_dict}
              }
          }
        \pdf_object_write:nne {@@/tree/namespaces}{array}
          {
            \prop_map_tokens:Nn \g_@@_role_NS_prop{\use_ii:nn}
          }
      }
  }
%    \end{macrocode}
% \end{macro}
% \subsection{Finishing the structure}
% This assembles the various parts.
% TODO (when tabular are done or if someone requests it): IDTree
% \begin{macro}{ \@@_finish_structure: }
% If the tree is active this first runs the hook |tagpdf/finish/before|
% and the final checks. Then the ParentTree, the IDTree, the RoleMap,
% the ClassMap, the namespaces, the structure elements and at last the root
% are written, in this order. Every step is announced on the terminal and
% is surrounded by the benchmark commands.
%    \begin{macrocode}
\hook_new:n {tagpdf/finish/before}
\cs_new_protected:Npn \@@_finish_structure:
  {
    \bool_if:NT\g_@@_active_tree_bool
      {
        \hook_use:n {tagpdf/finish/before}
        \@@_tree_final_checks:
        \iow_term:n{Package~tagpdf~Info:~writing~ParentTree}
        \@@_check_benchmark_tic:
        \@@_tree_write_parenttree:
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~IDTree}
        \@@_check_benchmark_tic:
        \@@_tree_write_idtree:
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~RoleMap}
        \@@_check_benchmark_tic:
        \@@_tree_write_rolemap:
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~ClassMap}
        \@@_check_benchmark_tic:
        \@@_tree_write_classmap:
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~NameSpaces}
        \@@_check_benchmark_tic:
        \@@_tree_write_namespaces:
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~StructElems}
        \@@_check_benchmark_tic:
        \@@_tree_write_structelements: %this is rather slow!!
        \@@_check_benchmark_toc:
        \iow_term:n{Package~tagpdf~Info:~writing~Root}
        \@@_check_benchmark_tic:
        \@@_tree_write_structtreeroot:
        \@@_check_benchmark_toc:
      }
  }
%</package>
%    \end{macrocode}
% \end{macro}
%
% \subsection{StructParents entry for Page}
% We need to add to the Page resources the |StructParents| entry, this is
% simply the absolute page number. The value is taken from the readonly
% shipout counter in the |shipout/before| hook. The hook code is only
% installed if the tree is active at the begin of the document.
%    \begin{macrocode}
%<*package>
\hook_gput_code:nnn{begindocument}{tagpdf}
  {
    \bool_if:NT\g_@@_active_tree_bool
      {
        \hook_gput_code:nnn{shipout/before} { tagpdf/structparents }
          {
            \pdfmanagement_add:nne
              { Page }
              { StructParents }
              { \int_eval:n { \g_shipout_readonly_int} }
          }
      }
  }
%</package>
%    \end{macrocode}
% \end{implementation}
% \PrintIndex