Skip to content

Commit 0ded3cc

Browse files
Enhances documentation across UDS modules, including detailed descriptions for the UDS corpus, annotation, and metadata classes. Refines type hints for improved clarity and consistency, ensuring better type safety throughout the UDS annotation system. Updates method signatures and docstrings to reflect changes, enhancing usability for developers working with UDS datasets.
1 parent 70e019f commit 0ded3cc

File tree

6 files changed

+548
-321
lines changed

6 files changed

+548
-321
lines changed

decomp/corpus/corpus.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,56 @@
1-
"""Module for defining abstract graph corpus readers"""
1+
"""Abstract base class for graph corpus readers.
2+
3+
This module provides the foundational :class:`Corpus` class for managing collections
4+
of graphs in the decomp framework. The Corpus class serves as an abstract base that
5+
concrete corpus implementations extend to handle specific graph formats.
6+
7+
The module defines a generic corpus container that:
8+
- Accepts raw graphs in an input format
9+
- Transforms them to an output format via an abstract graph builder
10+
- Provides dictionary-like access to the processed graphs
11+
- Handles errors during graph construction gracefully
12+
13+
Type Variables
14+
--------------
15+
InGraph
16+
The input graph type that will be processed by the corpus reader.
17+
18+
OutGraph
19+
The output graph type produced after processing.
20+
21+
Type Aliases
22+
------------
23+
GraphDict[T]
24+
Generic dictionary mapping hashable identifiers to graphs of type T.
25+
26+
Classes
27+
-------
28+
Corpus
29+
Abstract base class for graph corpus containers with generic type parameters
30+
for input and output graph formats.
31+
"""
232

333
from abc import ABCMeta, abstractmethod
434
from collections.abc import Hashable, ItemsView, Iterator
535
from logging import warning
636
from random import sample
7-
from typing import Generic, TypeAlias, TypeVar
37+
from typing import TypeVar
838

939

1040
InGraph = TypeVar('InGraph') # the input graph type
1141
OutGraph = TypeVar('OutGraph') # the output graph type
1242

13-
GraphDict: TypeAlias = dict[Hashable, OutGraph]
43+
type GraphDict[T] = dict[Hashable, T]
1444

1545

16-
class Corpus(Generic[InGraph, OutGraph], metaclass=ABCMeta):
17-
"""Container for graphs
46+
class Corpus[InGraph, OutGraph](metaclass=ABCMeta):
47+
"""Container for graphs.
1848
1949
Parameters
2050
----------
2151
graphs_raw
22-
a sequence of graphs in a format that the graphbuilder for a
23-
subclass of this abstract class can process
52+
A mapping from graph identifiers to graphs in a format that the graphbuilder for a
53+
subclass of this abstract class can process.
2454
"""
2555

2656
def __init__(self, graphs_raw: dict[Hashable, InGraph]):
@@ -32,7 +62,7 @@ def __iter__(self) -> Iterator[Hashable]:
3262
return iter(self._graphs)
3363

3464
def items(self) -> ItemsView[Hashable, OutGraph]:
35-
"""Dictionary-like iterator for (graphid, graph) pairs"""
65+
"""Dictionary-like iterator for (graphid, graph) pairs."""
3666
return self._graphs.items()
3767

3868
def __getitem__(self, k: Hashable) -> OutGraph:
@@ -56,14 +86,16 @@ def _build_graphs(self) -> None:
5686
warning(f'{graphid} has loops')
5787

5888
@abstractmethod
59-
def _graphbuilder(self,
60-
graphid: Hashable,
61-
rawgraph: InGraph) -> OutGraph:
89+
def _graphbuilder(
90+
self,
91+
graphid: Hashable,
92+
rawgraph: InGraph
93+
) -> OutGraph:
6294
raise NotImplementedError
6395

6496
@property
6597
def graphs(self) -> dict[Hashable, OutGraph]:
66-
"""The graphs in corpus"""
98+
"""The graphs in corpus."""
6799
return self._graphs
68100

69101
@property

decomp/semantics/uds/__init__.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,52 @@
1-
"""Module for representing UDS corpora, documents, graphs, and annotations."""
1+
"""Universal Decompositional Semantics (UDS) representation framework.
2+
3+
This module provides a comprehensive framework for working with Universal Decompositional
4+
Semantics (UDS) datasets. UDS is a semantic annotation framework that captures diverse
5+
semantic properties of natural language texts through real-valued annotations on
6+
predicate-argument structures.
7+
8+
The module is organized hierarchically:
9+
10+
- **Annotations** (:mod:`~decomp.semantics.uds.annotation`): Provides classes for handling
11+
UDS property annotations in both raw (multi-annotator) and normalized (aggregated) formats.
12+
13+
- **Graphs** (:mod:`~decomp.semantics.uds.graph`): Implements graph representations at
14+
sentence and document levels, integrating syntactic dependency structures with semantic
15+
annotations.
16+
17+
- **Documents** (:mod:`~decomp.semantics.uds.document`): Represents complete documents
18+
containing multiple sentences with their associated graphs and metadata.
19+
20+
- **Corpus** (:mod:`~decomp.semantics.uds.corpus`): Manages collections of UDS documents
21+
and provides functionality for loading, querying, and serializing UDS datasets.
22+
23+
Classes
24+
-------
25+
NormalizedUDSAnnotation
26+
Annotations with aggregated values and confidence scores from multiple annotators.
27+
28+
RawUDSAnnotation
29+
Annotations preserving individual annotator responses before aggregation.
30+
31+
UDSSentenceGraph
32+
Graph representation of a single sentence with syntax and semantics layers.
33+
34+
UDSDocumentGraph
35+
Graph connecting multiple sentence graphs within a document.
36+
37+
UDSDocument
38+
Container for sentence graphs and document-level annotations.
39+
40+
UDSCorpus
41+
Collection of UDS documents with support for various data formats and queries.
42+
43+
Notes
44+
-----
45+
The UDS framework builds upon the PredPatt system for extracting predicate-argument
46+
structures and extends it with rich semantic annotations. All graph representations
47+
use NetworkX for the underlying graph structure and support SPARQL queries via RDF
48+
conversion.
49+
"""
250

351
from .annotation import NormalizedUDSAnnotation, RawUDSAnnotation
452
from .corpus import UDSCorpus

decomp/semantics/uds/annotation.py

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from overrides import overrides
2323

2424
from .metadata import PrimitiveType, UDSAnnotationMetadata, UDSPropertyMetadata
25-
from .types import AnnotatorValue as TypedAnnotatorValue
25+
from .types import AnnotatorValue as TypedAnnotatorValue, UDSSubspace
2626

2727

2828
# type aliases for annotation data structures
@@ -209,11 +209,13 @@ def _process_node_data(self, data: dict[str, dict[str, NormalizedData | RawData]
209209

210210
# Some attributes are not property subspaces and are thus excluded
211211
self._excluded_attributes = {'subpredof', 'subargof', 'headof', 'span', 'head'}
212-
self._node_subspaces = {ss for gid, nodedict
213-
in self._node_attributes.items()
214-
for nid, subspaces in nodedict.items()
215-
for ss in subspaces}
216-
self._node_subspaces = self._node_subspaces - self._excluded_attributes
212+
self._node_subspaces: set[UDSSubspace] = {
213+
cast(UDSSubspace, ss) for gid, nodedict
214+
in self._node_attributes.items()
215+
for nid, subspaces in nodedict.items()
216+
for ss in subspaces
217+
if ss not in self._excluded_attributes
218+
}
217219

218220
def _process_edge_data(self, data: dict[str, dict[str, NormalizedData | RawData]]) -> None:
219221
"""Extract edge attributes from annotation data.
@@ -231,10 +233,12 @@ def _process_edge_data(self, data: dict[str, dict[str, NormalizedData | RawData]
231233
if '%%' in edge}
232234
for gid, attrs in data.items()}
233235

234-
self._edge_subspaces = {ss for gid, edgedict
235-
in self._edge_attributes.items()
236-
for eid, subspaces in edgedict.items()
237-
for ss in subspaces}
236+
self._edge_subspaces: set[UDSSubspace] = {
237+
cast(UDSSubspace, ss) for gid, edgedict
238+
in self._edge_attributes.items()
239+
for eid, subspaces in edgedict.items()
240+
for ss in subspaces
241+
}
238242

239243
def _validate(self) -> None:
240244
"""Validate annotation data consistency.
@@ -454,39 +458,39 @@ def metadata(self) -> UDSAnnotationMetadata:
454458
return self._metadata
455459

456460
@property
457-
def node_subspaces(self) -> set[str]:
461+
def node_subspaces(self) -> set[UDSSubspace]:
458462
"""Set of subspaces used in node annotations.
459463
460464
Returns
461465
-------
462-
set[str]
466+
set[UDSSubspace]
463467
Subspace names excluding structural attributes
464468
"""
465469
return self._node_subspaces
466470

467471
@property
468-
def edge_subspaces(self) -> set[str]:
472+
def edge_subspaces(self) -> set[UDSSubspace]:
469473
"""Set of subspaces used in edge annotations.
470474
471475
Returns
472476
-------
473-
set[str]
477+
set[UDSSubspace]
474478
Subspace names for edges
475479
"""
476480
return self._edge_subspaces
477481

478482
@property
479-
def subspaces(self) -> set[str]:
483+
def subspaces(self) -> set[UDSSubspace]:
480484
"""Set of all subspaces (node and edge).
481485
482486
Returns
483487
-------
484-
set[str]
488+
set[UDSSubspace]
485489
Union of node and edge subspaces
486490
"""
487491
return self.node_subspaces | self._edge_subspaces
488492

489-
def properties(self, subspace: str | None = None) -> set[str]:
493+
def properties(self, subspace: UDSSubspace | None = None) -> set[str]:
490494
"""Get properties for a subspace.
491495
492496
Parameters
@@ -501,7 +505,7 @@ def properties(self, subspace: str | None = None) -> set[str]:
501505
"""
502506
return self._metadata.properties(subspace)
503507

504-
def property_metadata(self, subspace: str,
508+
def property_metadata(self, subspace: UDSSubspace,
505509
prop: str) -> UDSPropertyMetadata:
506510
"""Get metadata for a specific property.
507511
@@ -650,11 +654,13 @@ def _process_node_data(self, data: dict[str, dict[str, NormalizedData | RawData]
650654

651655
# some attributes are not property subspaces and are thus excluded
652656
self._excluded_attributes = {'subpredof', 'subargof', 'headof', 'span', 'head'}
653-
self._node_subspaces = {ss for gid, nodedict
654-
in self._node_attributes.items()
655-
for nid, subspaces in nodedict.items()
656-
for ss in subspaces}
657-
self._node_subspaces = self._node_subspaces - self._excluded_attributes
657+
self._node_subspaces: set[UDSSubspace] = {
658+
cast(UDSSubspace, ss) for gid, nodedict
659+
in self._node_attributes.items()
660+
for nid, subspaces in nodedict.items()
661+
for ss in subspaces
662+
if ss not in self._excluded_attributes
663+
}
658664

659665
# initialize as nested defaultdict, will be frozen to regular dict later
660666
# the actual type is a nested defaultdict but we'll treat it as the final dict type
@@ -692,10 +698,12 @@ def _process_edge_data(self, data: dict[str, dict[str, NormalizedData | RawData]
692698
if '%%' in edge}
693699
for gid, attrs in data.items()}
694700

695-
self._edge_subspaces = {ss for gid, edgedict
696-
in self._edge_attributes.items()
697-
for eid, subspaces in edgedict.items()
698-
for ss in subspaces}
701+
self._edge_subspaces: set[UDSSubspace] = {
702+
cast(UDSSubspace, ss) for gid, edgedict
703+
in self._edge_attributes.items()
704+
for eid, subspaces in edgedict.items()
705+
for ss in subspaces
706+
}
699707

700708
# initialize as nested defaultdict, will be frozen to regular dict later
701709
# the actual type is a nested defaultdict but we'll treat it as the final dict type
@@ -820,7 +828,7 @@ class method must be:
820828
"""
821829
return cast('RawUDSAnnotation', super().from_json(jsonfile))
822830

823-
def annotators(self, subspace: str | None = None,
831+
def annotators(self, subspace: UDSSubspace | None = None,
824832
prop: str | None = None) -> set[str] | None:
825833
"""Get annotator IDs for a subspace and property.
826834

decomp/semantics/uds/corpus.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from logging import warn
2525
from os.path import basename, splitext
2626
from random import sample
27-
from typing import TextIO, TypeAlias, cast
27+
from typing import Literal, TextIO, TypeAlias, cast
2828
from zipfile import ZipFile
2929

3030
import requests
@@ -35,7 +35,7 @@
3535
from .annotation import NormalizedUDSAnnotation, RawUDSAnnotation, UDSAnnotation
3636
from .document import SentenceGraphDict, UDSDocument
3737
from .graph import EdgeAttributes, EdgeKey, NodeAttributes, UDSSentenceGraph
38-
from .metadata import UDSCorpusMetadata, UDSPropertyMetadata
38+
from .metadata import AnnotationMetadataDict, UDSCorpusMetadata, UDSPropertyMetadata
3939

4040

4141
Location: TypeAlias = str | TextIO
@@ -69,8 +69,8 @@ class UDSCorpus(PredPattCorpus):
6969

7070
UD_URL = 'https://github.com/UniversalDependencies/' +\
7171
'UD_English-EWT/archive/r1.2.zip'
72-
ANN_DIR = str(importlib.resources.files('decomp') / 'data')
73-
CACHE_DIR = str(importlib.resources.files('decomp') / 'data')
72+
ANN_DIR = str(importlib.resources.files('decomp') / 'data') + '/'
73+
CACHE_DIR = str(importlib.resources.files('decomp') / 'data') + '/'
7474

7575
def __init__(self,
7676
sentences: PredPattCorpus | None = None,
@@ -120,16 +120,15 @@ def __init__(self,
120120
self._sentences = {str(name): UDSSentenceGraph(g, str(name))
121121
for name, g in sentences.items()}
122122
self._graphs = self._sentences
123+
else:
124+
# When sentences is already a dict of UDSSentenceGraph objects
125+
self._sentences = sentences
126+
self._graphs = self._sentences
123127

124128
self._documents = documents or {}
125129

126-
if sentence_annotations:
127-
for ann in sentence_annotations:
128-
self.add_annotation(ann)
129-
130-
if document_annotations:
131-
for ann in document_annotations:
132-
self.add_annotation(document_annotation=ann)
130+
if sentence_annotations or document_annotations:
131+
self.add_annotation(sentence_annotations, document_annotations)
133132

134133
def _validate_arguments(self, sentences: PredPattCorpus | None, documents: dict[str, UDSDocument] | None,
135134
version: str, split: str | None, annotation_format: str) -> None:
@@ -497,11 +496,16 @@ def from_json(cls, sentences_jsonfile: Location,
497496
sent_ids, name)
498497
for name, d_json in documents_json['data'].items()}
499498

500-
corpus = cls(cast(PredPattCorpus | None, sentences), documents)
499+
corpus = cls(sentences, documents)
501500

502-
metadata_dict = {'sentence_metadata': sentences_json['metadata'],
503-
'document_metadata': documents_json['metadata']}
504-
metadata = UDSCorpusMetadata.from_dict(metadata_dict)
501+
metadata_dict = {
502+
'sentence_metadata': sentences_json['metadata'],
503+
'document_metadata': documents_json['metadata']
504+
}
505+
metadata = UDSCorpusMetadata.from_dict(cast(
506+
dict[Literal['sentence_metadata', 'document_metadata'], AnnotationMetadataDict],
507+
metadata_dict
508+
))
505509
corpus.add_corpus_metadata(metadata)
506510

507511
return corpus
@@ -516,8 +520,8 @@ def add_corpus_metadata(self, metadata: UDSCorpusMetadata) -> None:
516520
"""
517521
self._metadata += metadata
518522

519-
def add_annotation(self, sentence_annotation: UDSAnnotation | None = None,
520-
document_annotation: UDSAnnotation | None = None) -> None:
523+
def add_annotation(self, sentence_annotation: list[UDSAnnotation] | None = None,
524+
document_annotation: list[UDSAnnotation] | None = None) -> None:
521525
"""Add annotations to UDS sentence and document graphs
522526
523527
Parameters
@@ -528,10 +532,12 @@ def add_annotation(self, sentence_annotation: UDSAnnotation | None = None,
528532
the annotations to add to the document graphs in the corpus
529533
"""
530534
if sentence_annotation:
531-
self.add_sentence_annotation(sentence_annotation)
535+
for ann in sentence_annotation:
536+
self.add_sentence_annotation(ann)
532537

533538
if document_annotation:
534-
self.add_document_annotation(document_annotation)
539+
for ann in document_annotation:
540+
self.add_document_annotation(ann)
535541

536542
def add_sentence_annotation(self, annotation: UDSAnnotation) -> None:
537543
"""Add annotations to UDS sentence graphs

0 commit comments

Comments
 (0)