# Public API of the ``text_splitters`` package.
#
# The re-export is best-effort: if the splitter module cannot be imported the
# package itself must stay importable.  ``__all__`` is assigned on both paths
# so that the declared public API is always defined; previously it was left
# unset whenever the import failed.
try:
    from neo4j_graphrag.experimental.components.text_splitters.hierarchical_splitter import (
        HierarchicalTextSplitter,
    )

    __all__ = ["HierarchicalTextSplitter"]
except ImportError:
    # Nothing could be re-exported; advertise an empty public API instead of
    # leaving ``__all__`` undefined.
    __all__ = []
+# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Hierarchical text splitter that detects section boundaries before chunking. + +Supported header strategies: + +* ``"markdown"`` — ATX Markdown header lines (``# H1``, ``## H2``, … up to ``###### H6``): one to six ``#`` characters followed by a space and header text. +* ``"capitalization"`` — short Title Case or ALL_CAPS lines without terminal + punctuation (appropriate for plain-text output from loaders like + LiteParseLoader). +* ``"blank_line"`` — short lines that are surrounded by blank lines on both + sides (a common plain-text section marker). +* ``"spacy_verbless"`` — SpaCy-parsed sentences that are short, contain no + verb, and are followed by a longer sentence. + +All strategies produce a list of *sections* (contiguous text blocks). Each +section is then emitted as a single chunk when it fits within ``max_chunk_size``, +or recursively split with ``chunk_overlap`` when it is larger. + +Optionally, when ``drop_verbless_sentences=True`` (default), SpaCy is used to +remove sentences with no verb token from every emitted chunk. 
def _split_at_markdown_headers(text: str) -> list[str]:
    """Split *text* into sections at Markdown ATX header lines (``#``, ``##``, …).

    A header is a line that starts with one to six ``#`` characters followed
    by whitespace and at least one non-blank character.  Each header line
    opens a new section and stays attached to the text that follows it.

    Lines inside a fenced code block (opened by three or more backticks or
    tildes, indented by at most three spaces) are never treated as headers,
    so a ``# comment`` inside a code sample does not cut its section in two.
    An unclosed fence runs to the end of the text.

    Args:
        text: The Markdown text to split.

    Returns:
        The sections in document order, line endings preserved.
        Whitespace-only sections are dropped, so empty or blank input
        yields an empty list.
    """
    # Compiled once per call, outside the per-line loop.
    header_re = re.compile(r"^#{1,6}\s+\S")
    fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})")

    sections: list[str] = []
    current: list[str] = []
    open_fence = ""  # marker that opened the current code fence, "" when outside

    for line in text.splitlines(keepends=True):
        fence = fence_re.match(line)

        if open_fence:
            # Inside a fence only a matching closing fence is significant:
            # same character, at least as long, nothing but whitespace after.
            closes = (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            )
            if closes:
                open_fence = ""
            current.append(line)
            continue

        if fence is not None:
            open_fence = fence.group(1)
            current.append(line)
            continue

        # Split *before* the header so it stays with its own section.  The
        # very first header has nothing before it and simply opens section 0.
        if current and header_re.match(line):
            sections.append("".join(current))
            current = [line]
        else:
            current.append(line)

    if current:
        sections.append("".join(current))
    return [section for section in sections if section.strip()]
+ prev_blank = (i == 0) or (not lines[i - 1].strip()) + next_blank = (i + 1 >= n) or (not lines[i + 1].strip()) + word_count = len(stripped.split()) if stripped else 0 + is_header = ( + stripped + and len(stripped) <= _SHORT_LINE_MAX + and not _TERMINAL_PUNCT_RE.search(stripped) + and word_count <= 6 + and prev_blank + and next_blank + ) + if is_header and current: + sections.append("".join(current)) + current = [line] + else: + current.append(line) + i += 1 + + if current: + sections.append("".join(current)) + return [s for s in sections if s.strip()] + + +def _split_at_spacy_verbless(text: str, nlp: Any) -> list[str]: + """Split *text* into sections at SpaCy-detected verbless heading sentences. + + A sentence qualifies as a heading when it: + + * Has at most 80 characters. + * Contains no verb token (POS tag ``VERB`` or ``AUX``). + * Is immediately followed by a longer sentence (> 80 chars). + """ + doc = nlp(text) + sentences = list(doc.sents) + if not sentences: + return [text] if text.strip() else [] + + # Identify which sentences are "headers". + is_header = [False] * len(sentences) + for idx, sent in enumerate(sentences): + sent_text = sent.text.strip() + if len(sent_text) > _SHORT_LINE_MAX: + continue + has_verb = any(tok.pos_ in ("VERB", "AUX") for tok in sent) + if has_verb: + continue + # Must be followed by a longer sentence. + if ( + idx + 1 < len(sentences) + and len(sentences[idx + 1].text.strip()) > _SHORT_LINE_MAX + ): + is_header[idx] = True + + # Build sections: split before each header sentence (except the first). 
def _split_with_overlap(text: str, max_size: int, overlap: int) -> list[str]:
    """Split *text* into chunks of at most *max_size* characters.

    Each chunk after the first starts *overlap* characters before the end of
    the previous chunk, so consecutive chunks share exactly that many
    characters and no part of *text* is ever skipped.  When a chunk would end
    in the middle of a word, it is shortened to end just after the last
    whitespace character, provided the shortened chunk still extends past the
    overlap region (otherwise the chunk is cut at *max_size*).

    Args:
        text: The text to split.
        max_size: Maximum chunk length in characters; must be positive.
        overlap: Number of characters repeated at the start of the next
            chunk.  Values outside ``0 .. max_size - 1`` are clamped into
            that range so the split always makes progress.

    Returns:
        The chunks in order; an empty list for empty *text*.

    Raises:
        ValueError: If *max_size* is not strictly positive.
    """
    if not text:
        return []
    if max_size <= 0:
        raise ValueError("max_size must be strictly greater than 0")

    # Clamping guarantees every chunk contributes at least one new character.
    overlap = min(max(overlap, 0), max_size - 1)

    length = len(text)
    chunks: list[str] = []
    start = 0

    while True:
        end = min(start + max_size, length)

        if end < length:
            # Prefer a whitespace boundary, but never cut inside the overlap
            # region: such a chunk would add nothing new.
            floor = start + overlap
            cut = end
            while cut > floor and not text[cut - 1].isspace():
                cut -= 1
            if cut > floor:
                end = cut

        chunks.append(text[start:end])

        if end >= length:
            # The text is fully covered; stepping further would only emit
            # fragments already contained in this last chunk.
            break

        # Continue from the *actual* end of this chunk.  Advancing by a fixed
        # step instead would skip text whenever the whitespace cut shortened
        # the chunk.
        start = end - overlap

    return chunks
class HierarchicalTextSplitter(TextSplitter):
    """Section-aware text splitter.

    The text is first cut into sections at detected header boundaries.  A
    section that fits within ``max_chunk_size`` characters is emitted as one
    chunk; a larger section is cut further, with ``chunk_overlap`` characters
    shared between consecutive pieces.

    Args:
        max_chunk_size (int): Maximum number of characters per output chunk.
            Defaults to 2048.
        chunk_overlap (int): Characters of overlap between consecutive chunks
            when a section has to be split further.  Must be smaller than
            ``max_chunk_size``.  Defaults to 200.
        header_strategy (str): Section-boundary detector.  One of:

            * ``"markdown"`` — Markdown ATX header lines (``#``, ``##``, …).
            * ``"capitalization"`` — short Title Case or ALL_CAPS lines
              without terminal punctuation.
            * ``"blank_line"`` — short lines with a blank line on both sides.
            * ``"spacy_verbless"`` — short verbless sentences (per SpaCy)
              that precede a longer sentence.

        model (str): SpaCy model to load when *header_strategy* is
            ``"spacy_verbless"`` or *drop_verbless_sentences* is ``True``.
            Defaults to ``"en_core_web_sm"``.
        drop_verbless_sentences (bool): When ``True`` (default), sentences
            without a verb are removed from every emitted chunk using SpaCy.
            Because this is the default, SpaCy is loaded at construction time
            whatever the *header_strategy* — install ``neo4j-graphrag[nlp]``
            or pass ``drop_verbless_sentences=False`` to avoid the dependency.

    Raises:
        ValueError: If a size argument is out of range, *header_strategy* is
            unknown, or the requested SpaCy model is not installed.
        ImportError: If SpaCy is needed but not installed.

    Example:

    .. code-block:: python

        from neo4j_graphrag.experimental.components.text_splitters.hierarchical_splitter import (
            HierarchicalTextSplitter,
        )
        from neo4j_graphrag.experimental.pipeline import Pipeline

        pipeline = Pipeline()
        splitter = HierarchicalTextSplitter(
            max_chunk_size=2048,
            chunk_overlap=200,
            header_strategy="markdown",
        )
        pipeline.add_component(splitter, "text_splitter")
    """

    @validate_call
    def __init__(
        self,
        max_chunk_size: int = 2048,
        chunk_overlap: int = 200,
        header_strategy: str = "markdown",
        model: str = "en_core_web_sm",
        drop_verbless_sentences: bool = True,
    ) -> None:
        # Checks run in a fixed order; the first violated constraint wins.
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be strictly greater than 0")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be >= 0")
        if chunk_overlap >= max_chunk_size:
            raise ValueError("chunk_overlap must be strictly less than max_chunk_size")
        if header_strategy not in _VALID_STRATEGIES:
            allowed = sorted(_VALID_STRATEGIES)
            raise ValueError(
                f"header_strategy must be one of {allowed}, got {header_strategy!r}"
            )

        self.max_chunk_size = max_chunk_size
        self.chunk_overlap = chunk_overlap
        self.header_strategy = header_strategy
        self.model = model
        self.drop_verbless_sentences = drop_verbless_sentences

        # SpaCy is loaded eagerly, but only for configurations that use it:
        # the verbless-sentence filter or the "spacy_verbless" strategy.
        uses_spacy = drop_verbless_sentences or header_strategy == "spacy_verbless"
        self._nlp: Optional[Any] = self._load_spacy(model) if uses_spacy else None

    @staticmethod
    def _load_spacy(model: str) -> Any:
        """Import SpaCy and load *model*.

        Raises:
            ImportError: If the ``spacy`` package cannot be imported.
            ValueError: If ``spacy.load`` raises ``OSError`` for *model*.
        """
        try:
            import spacy  # noqa: PLC0415
        except ImportError as exc:
            raise ImportError(
                "SpaCy is required for this configuration of HierarchicalTextSplitter. "
                "Install it with: pip install 'neo4j-graphrag[nlp]'"
            ) from exc

        try:
            pipeline = spacy.load(model)
        except OSError as exc:
            raise ValueError(
                f"SpaCy model {model!r} is not installed. "
                f"Download it with: python -m spacy download {model}"
            ) from exc
        return pipeline

    def _detect_sections(self, text: str) -> list[str]:
        """Return the sections of *text* according to ``header_strategy``.

        When the strategy finds no section, the whole text is returned as a
        single section (or nothing at all if it is blank).
        """
        # Strategies that work on plain text without an NLP pipeline.
        plain_splitters = {
            "markdown": _split_at_markdown_headers,
            "capitalization": _split_at_capitalization,
            "blank_line": _split_at_blank_line,
        }
        splitter = plain_splitters.get(self.header_strategy)
        if splitter is not None:
            sections = splitter(text)
        else:  # "spacy_verbless"
            if self._nlp is None:
                raise RuntimeError(
                    "SpaCy model not loaded for 'spacy_verbless' strategy; this is a bug"
                )
            sections = _split_at_spacy_verbless(text, self._nlp)

        if sections:
            return sections
        return [text] if text.strip() else []

    def _chunk_section(self, section_text: str) -> list[str]:
        """Return *section_text* whole if it fits, else its overlapping pieces."""
        if len(section_text) > self.max_chunk_size:
            return _split_with_overlap(
                section_text, self.max_chunk_size, self.chunk_overlap
            )
        return [section_text]

    def _filter_verbless(self, text: str) -> str:
        """Drop verbless sentences from *text* when the filter is enabled.

        The original text is returned unchanged when the filter is disabled,
        no SpaCy pipeline is loaded, or filtering would leave nothing behind.
        """
        if self.drop_verbless_sentences and self._nlp is not None:
            filtered = _drop_verbless_sentences(text, self._nlp)
            if filtered.strip():
                return filtered
        return text

    @validate_call
    async def run(self, text: str) -> TextChunks:
        """Split *text* into hierarchical chunks.

        Args:
            text (str): The text to be split.

        Returns:
            TextChunks: The chunks in document order, indexed sequentially
            from 0.  Blank input yields no chunks.
        """
        if not text.strip():
            return TextChunks(chunks=[])

        # Every section is chunked first; filtering happens afterwards.
        pieces = [
            piece
            for section in self._detect_sections(text)
            for piece in self._chunk_section(section)
        ]

        chunks: list[TextChunk] = []
        for piece in pieces:
            kept = self._filter_verbless(piece)
            if kept.strip():
                # Indices count emitted chunks only, so they stay gap-free.
                chunks.append(TextChunk(text=kept, index=len(chunks)))

        return TextChunks(chunks=chunks)
def _make_fake_token(text: str, pos: str) -> MagicMock:
    """Build a stand-in for a spaCy Token exposing ``text`` and ``pos_``."""
    # Keyword arguments to the mock constructor become plain attributes.
    return MagicMock(text=text, pos_=pos)
def _make_fake_doc(sentences: list[MagicMock]) -> MagicMock:
    """Build a stand-in for a spaCy Doc whose ``.sents`` is *sentences*.

    Iterating the fake doc itself yields no tokens.
    """
    fake_doc = MagicMock(sents=sentences)
    fake_doc.__iter__ = MagicMock(return_value=iter([]))
    return fake_doc
@pytest.mark.asyncio
async def test_markdown_three_sections_sequential_indices() -> None:
    """Three Markdown sections yield three chunks indexed 0, 1, 2."""
    splitter = HierarchicalTextSplitter(
        max_chunk_size=2048,
        chunk_overlap=0,
        header_strategy="markdown",
        drop_verbless_sentences=False,
    )
    document = "# A\nSection A body.\n# B\nSection B body.\n# C\nSection C body."

    result = await splitter.run(document)

    assert len(result.chunks) == 3
    for position, chunk in enumerate(result.chunks):
        assert chunk.index == position
@pytest.mark.asyncio
async def test_capitalization_indices_sequential() -> None:
    """Chunk indices count up from 0 with the capitalization strategy."""
    splitter = HierarchicalTextSplitter(
        max_chunk_size=2048,
        chunk_overlap=0,
        header_strategy="capitalization",
        drop_verbless_sentences=False,
    )
    document = "PART ONE\nBody of part one.\nPART TWO\nBody of part two.\nPART THREE\nBody of part three."

    result = await splitter.run(document)

    for i, chunk in enumerate(result.chunks):
        assert chunk.index == i, f"chunk {i} has index {chunk.index}"
@pytest.mark.asyncio
async def test_large_section_split_with_overlap() -> None:
    """A section larger than max_chunk_size is split into several chunks, and
    the last N characters of chunk K equal the first N characters of chunk K+1.

    The text contains no Markdown header, so the whole input is one section.
    It also contains no whitespace, so chunk boundaries are not moved to a
    word boundary and the overlap can be checked exactly.
    """
    # A non-repeating body ("0000" "0001" "0002" ...).  With a uniform body
    # such as "x" * 500 any two slices compare equal and the overlap check
    # below could never fail.
    body = "".join(f"{i:04d}" for i in range(125))
    overlap = 20
    max_size = 50
    splitter = HierarchicalTextSplitter(
        max_chunk_size=max_size,
        chunk_overlap=overlap,
        header_strategy="markdown",
        drop_verbless_sentences=False,
    )

    result = await splitter.run(body)
    chunks = result.chunks

    assert len(chunks) > 1
    assert all(len(chunk.text) <= max_size for chunk in chunks)

    # Last `overlap` chars of chunk[k] == first `overlap` chars of chunk[k+1].
    for k, (left, right) in enumerate(zip(chunks, chunks[1:])):
        tail = left.text[-overlap:]
        head = right.text[:overlap]
        assert tail == head, (
            f"Overlap mismatch between chunk {k} and {k + 1}: "
            f"tail={tail!r}, head={head!r}"
        )
@pytest.mark.asyncio
async def test_drop_verbless_removes_verbless_sentence() -> None:
    """With drop_verbless_sentences=True, a sentence without a verb is removed."""
    nlp_stub = _make_spacy_nlp_with_verbless()

    # The model is loaded in the constructor, so only construction is patched.
    with patch("spacy.load", return_value=nlp_stub):
        splitter = HierarchicalTextSplitter(
            max_chunk_size=2048,
            chunk_overlap=0,
            header_strategy="markdown",
            drop_verbless_sentences=True,
            model="en_core_web_sm",
        )

    result = await splitter.run("# Section\nNo verb here. The dog runs fast.")

    assert len(result.chunks) == 1
    remaining = result.chunks[0].text
    # The sentence with a verb survives; the verbless one is gone.
    assert "The dog runs fast" in remaining
    assert "No verb here" not in remaining
@pytest.mark.asyncio
async def test_indices_sequential_across_multiple_sections() -> None:
    """Indices are global: numbering continues across section boundaries."""
    document = "".join(
        [
            "# Alpha\nFirst section body.\n",
            "# Beta\nSecond section body.\n",
            "# Gamma\nThird section body.",
        ]
    )
    splitter = HierarchicalTextSplitter(
        max_chunk_size=2048,
        chunk_overlap=0,
        header_strategy="markdown",
        drop_verbless_sentences=False,
    )

    result = await splitter.run(document)

    assert len(result.chunks) == 3
    for expected_index, chunk in enumerate(result.chunks):
        assert chunk.index == expected_index
+ sent2_tokens = [ + _make_fake_token("This", "DET"), + _make_fake_token("section", "NOUN"), + _make_fake_token("covers", "VERB"), + ] + sent2 = _make_fake_sent(long_body, sent2_tokens) + + # Sentence 3 (verbless heading): "Conclusion" + sent3_tokens = [_make_fake_token("Conclusion", "NOUN")] + sent3 = _make_fake_sent("Conclusion", sent3_tokens) + + # Sentence 4 (another long body): + long_body2 = "This final section wraps up all the topics discussed and provides closing remarks for the reader." + sent4_tokens = [ + _make_fake_token("This", "DET"), + _make_fake_token("section", "NOUN"), + _make_fake_token("wraps", "VERB"), + ] + sent4 = _make_fake_sent(long_body2, sent4_tokens) + + fake_nlp = _make_nlp_returning_doc([sent1, sent2, sent3, sent4]) + + with patch("spacy.load", return_value=fake_nlp): + splitter = HierarchicalTextSplitter( + max_chunk_size=2048, + chunk_overlap=0, + header_strategy="spacy_verbless", + drop_verbless_sentences=False, + model="en_core_web_sm", + ) + + # Use arbitrary text — the nlp mock controls what sents are returned. + text = "Introduction. " + long_body + " Conclusion. " + long_body2 + result = await splitter.run(text) + + # Two verbless headings (sent1, sent3) each trigger a split → 2 sections. 
+ assert len(result.chunks) == 2 + assert result.chunks[0].index == 0 + assert result.chunks[1].index == 1 + + +# --------------------------------------------------------------------------- +# Tests: edge cases +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_empty_text_returns_no_chunks() -> None: + """Empty input produces an empty chunk list.""" + splitter = HierarchicalTextSplitter( + max_chunk_size=2048, + chunk_overlap=0, + header_strategy="markdown", + drop_verbless_sentences=False, + ) + result = await splitter.run("") + assert result.chunks == [] + + +@pytest.mark.asyncio +async def test_whitespace_only_returns_no_chunks() -> None: + """Whitespace-only input produces an empty chunk list.""" + splitter = HierarchicalTextSplitter( + max_chunk_size=2048, + chunk_overlap=0, + header_strategy="markdown", + drop_verbless_sentences=False, + ) + result = await splitter.run(" \n\t\n ") + assert result.chunks == [] + + +def test_invalid_header_strategy_raises() -> None: + """An unrecognised header_strategy raises ValueError at construction time.""" + with pytest.raises(ValueError, match="header_strategy must be one of"): + HierarchicalTextSplitter( + max_chunk_size=2048, + chunk_overlap=0, + header_strategy="unknown_strategy", + drop_verbless_sentences=False, + ) + + +def test_chunk_overlap_ge_max_chunk_size_raises() -> None: + """chunk_overlap >= max_chunk_size raises ValueError.""" + with pytest.raises(ValueError, match="chunk_overlap must be strictly less than max_chunk_size"): + HierarchicalTextSplitter( + max_chunk_size=100, + chunk_overlap=100, + header_strategy="markdown", + drop_verbless_sentences=False, + ) + + +def test_max_chunk_size_zero_raises() -> None: + """max_chunk_size=0 raises ValueError.""" + with pytest.raises(ValueError, match="max_chunk_size must be strictly greater than 0"): + HierarchicalTextSplitter( + max_chunk_size=0, + chunk_overlap=0, + header_strategy="markdown", + 
drop_verbless_sentences=False, + ) diff --git a/tests/unit/experimental/components/text_splitters/test_hierarchical_splitter_integration.py b/tests/unit/experimental/components/text_splitters/test_hierarchical_splitter_integration.py new file mode 100644 index 000000000..e2693880f --- /dev/null +++ b/tests/unit/experimental/components/text_splitters/test_hierarchical_splitter_integration.py @@ -0,0 +1,99 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Integration tests for HierarchicalTextSplitter using the real en_core_web_sm model. + +All tests in this module are automatically skipped when either spaCy or the +``en_core_web_sm`` model is not installed. No mocking is performed — these +tests exercise the full NLP pipeline end-to-end. 
+""" + +from __future__ import annotations + +import pytest + +spacy = pytest.importorskip("spacy") + + +@pytest.fixture(scope="module") +def nlp(): # type: ignore[return] + try: + return spacy.load("en_core_web_sm") + except OSError: + pytest.skip( + "en_core_web_sm not installed — run: python -m spacy download en_core_web_sm" + ) + + +@pytest.mark.asyncio +async def test_markdown_split_real(nlp) -> None: # noqa: ANN001 + """HierarchicalTextSplitter with markdown strategy returns sequential chunks for a 3-section doc.""" + from neo4j_graphrag.experimental.components.text_splitters.hierarchical_splitter import ( + HierarchicalTextSplitter, + ) + + text = ( + "# Introduction\n" + "This section introduces the topic and provides background information.\n\n" + "# Methods\n" + "This section describes the experimental methods used in the study.\n\n" + "# Conclusion\n" + "This section summarises the findings and suggests future work.\n" + ) + + splitter = HierarchicalTextSplitter( + header_strategy="markdown", + max_chunk_size=200, + chunk_overlap=20, + drop_verbless_sentences=False, + ) + result = await splitter.run(text) + + assert len(result.chunks) >= 2, ( + f"Expected at least 2 chunks for a 3-section markdown doc, got {len(result.chunks)}" + ) + # Indices must be sequential starting from 0. + for i, chunk in enumerate(result.chunks): + assert chunk.index == i, f"chunk {i} has non-sequential index {chunk.index}" + + +@pytest.mark.asyncio +async def test_drop_verbless_sentences_real(nlp) -> None: # noqa: ANN001 + """drop_verbless_sentences=True drops verbless fragments using the real SpaCy model.""" + from neo4j_graphrag.experimental.components.text_splitters.hierarchical_splitter import ( + HierarchicalTextSplitter, + ) + + # "Overview" is a single-word verbless fragment. + # The second sentence contains a real verb ("covers"). + text = "Overview\n\nThis section covers the main concepts of the system in detail." 
+ + splitter = HierarchicalTextSplitter( + header_strategy="blank_line", + max_chunk_size=500, + chunk_overlap=0, + drop_verbless_sentences=True, + model="en_core_web_sm", + ) + result = await splitter.run(text) + + # At least one chunk must survive. + assert len(result.chunks) >= 1, "Expected at least one chunk after filtering" + + # Only the survival of the verbal sentence is asserted here; whether the + # standalone "Overview" fragment is removed is not checked by this test. + all_text = " ".join(chunk.text for chunk in result.chunks) + assert "covers" in all_text, ( + "Expected the verbal sentence to survive verbless-sentence filtering" + )