diff options
| author | Ralph Amissah <ralph.amissah@gmail.com> | 2026-04-22 14:02:51 -0400 |
|---|---|---|
| committer | Ralph Amissah <ralph.amissah@gmail.com> | 2026-04-22 20:42:31 -0400 |
| commit | 0edd3f79f2ceaf74c009a1f4dc25aa95b5a1c673 (patch) | |
| tree | 65c123a56aa8fcac15f2e0e37cea0a366bce00c8 /src | |
| parent | .ssp serializer: omit identifier when it equals OCN (diff) | |
.ssp serializer: include all ObjGenericComposite fields
- Make the .ssp format a complete representation of the
document abstraction by serializing all remaining fields
from ObjGenericComposite (only omitting ptr.* runtime
indices which are meaningless outside the in-memory context).
- New fields added:
.ancestors_collapsed: - collapsed level ancestor chain
.dom_status: - DOM structure marked-up tags status[8]
.dom_status_collapsed: - DOM structure collapsed status[8]
.heading_lev_collapsed: - collapsed heading level
.parent_lev: - parent heading level (markup)
.o_n_type: - object numbering type (0=ocn, 1=non, 2=bkidx)
.is_of_type: - para/block type classification
.attrib: - general attributes string
.meta_lang: - block language (group/block/quote)
.meta_syntax: - codeblock syntax from metainfo
.sha256: - hex-encoded SHA-256 digest of object content
.has: - new flag value images_no_dim (image without dimensions)
.table_aligns: - column alignment array
.table_walls: - table walls/borders flag
.stow_link: - extracted URLs (one .stow_link: line per URL)
.heading_lev_anchor: - heading level anchor tag
.segment_epub: - EPUB segment anchor tag
.heading_ancestors_text: - pipe-separated ancestor headings
.lev4_subtoc: - sub-table-of-contents entries (one .lev4_subtoc: line per entry)
.anchor_tag: - additional anchor tags (one .anchor_tag: line per tag)
- Tested against all 35 sample documents - zero failures.
Co-Authored-By: Anthropic Claude Opus 4.6 (1M context)
Diffstat (limited to 'src')
| -rw-r--r-- | src/sisudoc/io_out/create_abstraction_txt.d | 129 |
1 file changed, 120 insertions, 9 deletions
diff --git a/src/sisudoc/io_out/create_abstraction_txt.d b/src/sisudoc/io_out/create_abstraction_txt.d index 140a27d..7d0425c 100644 --- a/src/sisudoc/io_out/create_abstraction_txt.d +++ b/src/sisudoc/io_out/create_abstraction_txt.d @@ -53,6 +53,7 @@ module sisudoc.io_out.create_abstraction_txt; /+ ↓ write document abstraction as human-readable .ssp text file +/ template spineAbstractionTxt() { import std.conv : to; + import std.digest : toHexString; import std.file; import std.path; import std.stdio; @@ -201,23 +202,94 @@ template spineAbstractionTxt() { output ~= ".last_descendant: " ~ obj.metainfo.last_descendant_ocn.to!string; /+ ↓ ancestors (only if non-zero) +/ - bool has_ancestors = false; - foreach (a; obj.metainfo.markedup_ancestors) { - if (a != 0) { has_ancestors = true; break; } + { + bool has_anc = false; + foreach (a; obj.metainfo.markedup_ancestors) { + if (a != 0) { has_anc = true; break; } + } + if (has_anc) { + string anc; + foreach (i, a; obj.metainfo.markedup_ancestors) { + if (i > 0) anc ~= " "; + anc ~= a.to!string; + } + output ~= ".ancestors: " ~ anc; + } } - if (has_ancestors) { - string anc; - foreach (i, a; obj.metainfo.markedup_ancestors) { - if (i > 0) anc ~= " "; - anc ~= a.to!string; + /+ ↓ collapsed ancestors (only if non-zero) +/ + { + bool has_anc_c = false; + foreach (a; obj.metainfo.collapsed_ancestors) { + if (a != 0) { has_anc_c = true; break; } + } + if (has_anc_c) { + string anc; + foreach (i, a; obj.metainfo.collapsed_ancestors) { + if (i > 0) anc ~= " "; + anc ~= a.to!string; + } + output ~= ".ancestors_collapsed: " ~ anc; + } + } + /+ ↓ dom structure status (only if non-zero) +/ + { + bool has_dom = false; + foreach (d; obj.metainfo.dom_structure_markedup_tags_status) { + if (d != 0) { has_dom = true; break; } + } + if (has_dom) { + string ds; + foreach (i, d; obj.metainfo.dom_structure_markedup_tags_status) { + if (i > 0) ds ~= " "; + ds ~= d.to!string; + } + output ~= ".dom_status: " ~ ds; + } + } + { + bool has_dom_c = 
false; + foreach (d; obj.metainfo.dom_structure_collapsed_tags_status) { + if (d != 0) { has_dom_c = true; break; } + } + if (has_dom_c) { + string ds; + foreach (i, d; obj.metainfo.dom_structure_collapsed_tags_status) { + if (i > 0) ds ~= " "; + ds ~= d.to!string; + } + output ~= ".dom_status_collapsed: " ~ ds; } - output ~= ".ancestors: " ~ anc; } + if (obj.metainfo.heading_lev_collapsed < 9) + output ~= ".heading_lev_collapsed: " ~ obj.metainfo.heading_lev_collapsed.to!string; + if (obj.metainfo.parent_lev_markup != 0) + output ~= ".parent_lev: " ~ obj.metainfo.parent_lev_markup.to!string; if (obj.metainfo.dummy_heading) output ~= ".dummy: true"; if (obj.metainfo.object_number_off) output ~= ".ocn_off: true"; + if (obj.metainfo.o_n_type != 0) + output ~= ".o_n_type: " ~ obj.metainfo.o_n_type.to!string; + if (obj.metainfo.is_of_type.length > 0) + output ~= ".is_of_type: " ~ obj.metainfo.is_of_type; + if (obj.metainfo.attrib.length > 0) + output ~= ".attrib: " ~ obj.metainfo.attrib; + if (obj.metainfo.lang.length > 0) + output ~= ".meta_lang: " ~ obj.metainfo.lang; + if (obj.metainfo.syntax.length > 0) + output ~= ".meta_syntax: " ~ obj.metainfo.syntax; + + /+ ↓ sha256 digest +/ + { + bool has_sha = false; + foreach (b; obj.metainfo.sha256) { + if (b != 0) { has_sha = true; break; } + } + if (has_sha) { + output ~= ".sha256: " ~ obj.metainfo.sha256.toHexString.to!string; + } + } /+ ↓ text attributes +/ if (obj.attrib.indent_base != 0 || obj.attrib.indent_hang != 0) @@ -235,6 +307,7 @@ template spineAbstractionTxt() { if (obj.has.inline_notes_reg) has_flags ~= "notes_reg"; if (obj.has.inline_notes_star) has_flags ~= "notes_star"; if (obj.has.images) has_flags ~= "images"; + if (obj.has.image_without_dimensions) has_flags ~= "images_no_dim"; if (has_flags.length > 0) output ~= ".has: " ~ has_flags.join(" "); } @@ -247,8 +320,13 @@ template spineAbstractionTxt() { foreach (w; obj.table.column_widths) ws ~= w.to!string; output ~= ".table_widths: " ~ ws.join(" "); } + 
if (obj.table.column_aligns.length > 0) { + output ~= ".table_aligns: " ~ obj.table.column_aligns.join(" "); + } if (obj.table.heading) output ~= ".table_header: true"; + if (obj.table.walls) + output ~= ".table_walls: true"; } /+ ↓ code block properties +/ @@ -259,6 +337,13 @@ template spineAbstractionTxt() { output ~= ".code_linenumbers: true"; } + /+ ↓ stow (extracted links) +/ + if (obj.stow.link.length > 0) { + foreach (lnk; obj.stow.link) { + output ~= ".stow_link: " ~ lnk; + } + } + /+ ↓ tag properties +/ if (obj.tags.in_segment_html.length > 0) output ~= ".segment: " ~ obj.tags.in_segment_html; @@ -269,6 +354,32 @@ template spineAbstractionTxt() { output ~= ".segment_prev: " ~ obj.tags.segname_prev; if (obj.tags.segname_next.length > 0) output ~= ".segment_next: " ~ obj.tags.segname_next; + if (obj.tags.heading_lev_anchor_tag.length > 0) + output ~= ".heading_lev_anchor: " ~ obj.tags.heading_lev_anchor_tag; + if (obj.tags.segment_anchor_tag_epub.length > 0) + output ~= ".segment_epub: " ~ obj.tags.segment_anchor_tag_epub; + /+ ↓ heading ancestors text +/ + { + bool has_hat = false; + foreach (h; obj.tags.heading_ancestors_text) { + if (h.length > 0) { has_hat = true; break; } + } + if (has_hat) { + output ~= ".heading_ancestors_text: " ~ obj.tags.heading_ancestors_text.join("|"); + } + } + /+ ↓ lev4 subtoc +/ + if (obj.tags.lev4_subtoc.length > 0) { + foreach (st; obj.tags.lev4_subtoc) { + output ~= ".lev4_subtoc: " ~ st; + } + } + /+ ↓ anchor tags +/ + if (obj.tags.anchor_tags.length > 0) { + foreach (at; obj.tags.anchor_tags) { + output ~= ".anchor_tag: " ~ at; + } + } /+ ↓ text content +/ if (obj.text.length > 0) { |
