Skip to content
This repository was archived by the owner on Sep 13, 2022. It is now read-only.

Commit cea9c6a

Browse files
committed
ZPar v0.7 compatibility and blank sentence handling
- Modified the Makefile to work with the recently released ZPar v0.7.
- Modified the Python wrappers to intercept blank lines and return empty strings.
- Updated the wrapper version number to 0.7.
1 parent c8a238f commit cea9c6a

File tree

7 files changed

+173
-62
lines changed

7 files changed

+173
-62
lines changed

Makefile

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ all: python-zpar
22

33
clean:
44
rm -rf /tmp/zpar
5-
rm -f /tmp/zpar.zip
5+
rm -f /tmp/zpar.tar.gz
66

7-
python-zpar: clean /tmp/zpar.zip
8-
unzip -q /tmp/zpar.zip -d /tmp
7+
python-zpar: clean /tmp/zpar.tar.gz
8+
tar -C /tmp/zpar -zxf /tmp/zpar.tar.gz --strip-components=1
99
cp src/zpar.lib.cpp /tmp/zpar/src/english
1010
cp src/Makefile.lib.zpar /tmp/zpar
1111
cp src/Makefile /tmp/zpar
@@ -14,7 +14,8 @@ python-zpar: clean /tmp/zpar.zip
1414
mkdir -p zpar/dist
1515
cp /tmp/zpar/dist/zpar.so zpar/dist/
1616

17-
/tmp/zpar.zip:
18-
wget -N http://sourceforge.net/projects/zpar/files/latest/zpar.zip -O /tmp/zpar.zip
17+
/tmp/zpar.tar.gz:
18+
wget -N http://sourceforge.net/projects/zpar/files/latest/zpar.tar.gz -O /tmp/zpar.tar.gz
1919
touch $@
20+
mkdir /tmp/zpar
2021

conda.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package:
22
name: python-zpar
3-
version: "0.6.0"
3+
version: "0.7.0"
44

55
build:
66
number: {{environ.get('BINSTAR_BUILD', 1)}}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def read(fname):
8282

8383
setup(
8484
name='python-zpar',
85-
version='0.6',
85+
version='0.7',
8686
description='A Wrapper around the ZPar statistical tagger/parser for English',
8787
maintainer='Nitin Madnani',
8888
maintainer_email='nmadnani@ets.org',

src/Makefile

Lines changed: 136 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@
2424
#
2525
#================================================================
2626

27+
# the generic tagger
28+
TAGGER_IMPL = collins
29+
30+
# the generic depparser
31+
DEPPARSER_IMPL = arceager
32+
33+
# the generic conparser
34+
CONPARSER_IMPL = srnew
35+
2736
# choose between agenda, agendachart etc ## NO SPACE AFTER NAME ###
2837
#
2938
# agenda: the single agenda method - reproduce paper
@@ -48,18 +57,33 @@ CHINESE_DEPPARSER_LABELED = true
4857
CHINESE_DEPLABELER_IMPL = naive
4958

5059
# currently support sr implementations
51-
CHINESE_CONPARSER_IMPL = jcad
60+
CHINESE_CONPARSER_IMPL = acl13
61+
CHINESE_CONPARSER_JOINT_OR_CASCADE = JOINT_CONPARSER
5262

5363
# currently support only agenda
5464
ENGLISH_TAGGER_IMPL = collins
5565

5666
# currently support eisner, covington, nivre, combined implementations
57-
ENGLISH_DEPPARSER_IMPL =arceager
67+
ENGLISH_DEPPARSER_IMPL = arceager
5868
ENGLISH_DEPPARSER_LABELED = true
5969
ENGLISH_DEPLABELER_IMPL = naive
70+
6071
# currently support sr implementations
6172
ENGLISH_CONPARSER_IMPL = muhua
6273

74+
# Spanish pos tagger
75+
SPANISH_TAGGER_IMPL = collins
76+
77+
# Spanish dependency parser
78+
SPANISH_DEPPARSER_IMPL = arceager
79+
SPANISH_DEPPARSER_LABELED = true
80+
SPANISH_DEPLABELER_IMPL = naive
81+
82+
# Spanish annotation. Supported: ES06_DEPENDENCIES, ES09_DEPENDENCIES
83+
SPANISH_ANNOTATION = ES09_DEPENDENCIES
84+
85+
#no Spanish constituency parser at the moment
86+
6387
#================================================================
6488
#
6589
# Debug mode or the run mode (empty)
@@ -78,6 +102,19 @@ DEBUG = -DNDEBUG
78102
BASE_DIR = .
79103
include Makefile.common
80104

105+
#================================================================
106+
#
107+
# cross platform configurations
108+
#
109+
#================================================================
110+
111+
ifeq ($(OS),Windows_NT)
112+
#use good old GNU mkdir instead of MSDOS mkdir on Windows
113+
MKDIR=gmkdir -p
114+
else
115+
MKDIR=mkdir -p
116+
endif
117+
81118
#================================================================
82119
#
83120
# compiler commands
@@ -100,37 +137,47 @@ LDFLAGS =
100137

101138
# the objects
102139
LINGUISTICS_OBJECTS = $(OBJECT_DIR)/linguistics/lemma.o $(OBJECT_DIR)/linguistics/conll.o
103-
OBJECTS = $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(LINGUISTICS_OBJECTS)
140+
LEARNING_OBJECTS = $(OBJECT_DIR)/learning/dbn.o
141+
OBJECTS = $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(LINGUISTICS_OBJECTS) $(LEARNING_OBJECTS)
104142

105143
$(OBJECT_DIR)/%.o: $(SRC_LIBS)/%.cpp $(SRC_INCLUDES)/%.h
106-
mkdir -p $(OBJECT_DIR)
107-
mkdir -p $(OBJECT_DIR)/linguistics
144+
$(MKDIR) $(OBJECT_DIR)
145+
$(MKDIR) $(OBJECT_DIR)/linguistics
146+
$(MKDIR) $(OBJECT_DIR)/learning
108147
$(CXX) $(CXXFLAGS) -c $< -o $@
109148

110149
all: zpar
111150

112151
# the directories
113152
$(OBJECT_DIR):
114-
mkdir -p $(OBJECT_DIR)
153+
$(MKDIR) $(OBJECT_DIR)
115154
$(DIST_DIR):
116-
mkdir -p $(DIST_DIR)
155+
$(MKDIR) $(DIST_DIR)
117156

118157
# tagger
119158
SRC_TAGGER = $(SRC_CHINESE)/tagger
120159
DIST_TAGGER = $(DIST_DIR)/chinese.postagger
121160
OBJECT_TAGGER = $(OBJECT_DIR)/chinese.postagger
122161
$(DIST_TAGGER):
123-
mkdir $(DIST_TAGGER)
162+
$(MKDIR) $(DIST_TAGGER)
124163
$(OBJECT_TAGGER):
125-
mkdir $(OBJECT_TAGGER)
164+
$(MKDIR) $(OBJECT_TAGGER)
126165

127166
SRC_ENGLISH_TAGGER = $(SRC_COMMON)/tagger
128167
DIST_ENGLISH_TAGGER = $(DIST_DIR)/english.postagger
129168
OBJECT_ENGLISH_TAGGER = $(OBJECT_DIR)/english.postagger
130169
$(DIST_ENGLISH_TAGGER):
131-
mkdir $(DIST_ENGLISH_TAGGER)
170+
$(MKDIR) $(DIST_ENGLISH_TAGGER)
132171
$(OBJECT_ENGLISH_TAGGER):
133-
mkdir $(OBJECT_ENGLISH_TAGGER)
172+
$(MKDIR) $(OBJECT_ENGLISH_TAGGER)
173+
174+
SRC_SPANISH_TAGGER = $(SRC_COMMON)/tagger
175+
DIST_SPANISH_TAGGER = $(DIST_DIR)/spanish.postagger
176+
OBJECT_SPANISH_TAGGER = $(OBJECT_DIR)/spanish.postagger
177+
$(DIST_SPANISH_TAGGER):
178+
$(MKDIR) $(DIST_SPANISH_TAGGER)
179+
$(OBJECT_SPANISH_TAGGER):
180+
$(MKDIR) $(OBJECT_SPANISH_TAGGER)
134181

135182
# depparser
136183
SRC_COMMON_DEPPARSER = $(SRC_COMMON)/depparser
@@ -144,6 +191,8 @@ DIST_DEPPARSER = $(DIST_DIR)/chinese.depparser
144191
OBJECT_DEPPARSER = $(OBJECT_DIR)/chinese.depparser
145192
DIST_ENGLISH_DEPPARSER = $(DIST_DIR)/english.depparser
146193
OBJECT_ENGLISH_DEPPARSER = $(OBJECT_DIR)/english.depparser
194+
DIST_SPANISH_DEPPARSER = $(DIST_DIR)/spanish.depparser
195+
OBJECT_SPANISH_DEPPARSER = $(OBJECT_DIR)/spanish.depparser
147196

148197
# deplabeler
149198
SRC_COMMON_DEPLABELER = $(SRC_COMMON)/deplabeler
@@ -153,14 +202,21 @@ OBJECT_DEPLABELER = $(OBJECT_DIR)/chinese.deplabeler
153202
SRC_ENGLISH_DEPLABELER = $(SRC_COMMON_DEPLABELER)
154203
DIST_ENGLISH_DEPLABELER = $(DIST_DIR)/english.deplabeler
155204
OBJECT_ENGLISH_DEPLABELER = $(OBJECT_DIR)/english.deplabeler
205+
SRC_SPANISH_DEPLABELER = $(SRC_COMMON_DEPLABELER)
206+
DIST_SPANISH_DEPLABELER = $(DIST_DIR)/spanish.deplabeler
207+
OBJECT_SPANISH_DEPLABELER = $(OBJECT_DIR)/spanish.deplabeler
156208

157209
# conparser
158210
SRC_COMMON_CONPARSER = $(SRC_COMMON)/conparser
159211
SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER)
160212
ifeq ($(CHINESE_CONPARSER_IMPL), jcad)
161213
SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser
162214
else
163-
SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER)
215+
ifeq ($(CHINESE_CONPARSER_IMPL), acl13)
216+
SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser
217+
else
218+
SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER)
219+
endif
164220
endif
165221
SRC_ENGLISH_CONPARSER = $(SRC_COMMON_CONPARSER)
166222
DIST_CONPARSER = $(DIST_DIR)/chinese.conparser
@@ -174,51 +230,88 @@ OBJECT_ENGLISH_CONPARSER = $(OBJECT_DIR)/english.conparser
174230
#
175231
#----------------------------------------------------------------
176232

177-
include Makefile.zpar
178-
include Makefile.zpar.en
179-
include Makefile.zpar.ge
180-
include Makefile.lib.zpar
181233

182-
#----------------------------------------------------------------
183-
#
184-
# The sentence boundary detector
185-
#
186-
#----------------------------------------------------------------
234+
ifeq ($(CHINESE_CONPARSER_IMPL), jcad)
235+
OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o
236+
else
237+
ifeq ($(CHINESE_CONPARSER_IMPL), acl13)
238+
OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o
239+
else
240+
OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o
241+
endif
242+
endif
187243

188-
include Makefile.doc2snt
244+
$(DIST_CONPARSER):
245+
$(MKDIR) $(DIST_CONPARSER)
246+
$(OBJECT_CONPARSER):
247+
$(MKDIR) $(OBJECT_CONPARSER)
189248

190-
#----------------------------------------------------------------
191-
#
192-
# The segmentor
193-
#
194-
#----------------------------------------------------------------
249+
$(DIST_DEPLABELER):
250+
$(MKDIR) $(DIST_DEPLABELER)
251+
$(OBJECT_DEPLABELER):
252+
$(MKDIR) $(OBJECT_DEPLABELER)
195253

196-
include Makefile.segmentor
254+
# the flags for train
255+
ifeq ($(CHINESE_TAGGER_IMPL), segmented) # if segmented
256+
TAGGER_TRAIN_FLAGS = -DSEGMENTED
257+
TAGGER_TEST_FLAGS = -DSEGMENTED
258+
else
259+
ifeq ($(CHINESE_TAGGER_IMPL), bidirectional) # else if bidirectional
260+
TAGGER_TRAIN_FLAGS = -DSEGMENTED -DAUTO
261+
TAGGER_TEST_FLAGS = -DSEGMENTED
262+
endif
263+
endif
197264

198-
#----------------------------------------------------------------
199-
#
200-
# The pos taggers (Chinese and English)
201-
#
202-
#----------------------------------------------------------------
203265

204-
include Makefile.postagger
266+
ifeq ($(CHINESE_DEPPARSER_LABELED), true)
267+
CHINESE_DEPPARSER_D = -DLABELED
268+
endif
205269

206-
#----------------------------------------------------------------
207-
#
208-
# The depparsers (Chinese and English)
209-
#
210-
#----------------------------------------------------------------
270+
ifeq ($(ENGLISH_DEPPARSER_LABELED), true)
271+
ENGLISH_DEPPARSER_D = -DLABELED
272+
endif
273+
274+
ifeq ($(CHINESE_DEPPARSER_IMPL), combined)
275+
CHINESE_DEPPARSER_D := $(CHINESE_DEPPARSER_D) -DCOMBINED
276+
CHINESE_DEPPARSER_IMPL = nivre
277+
endif
278+
279+
ifeq ($(ENGLISH_DEPPARSER_IMPL), combined)
280+
ENGLISH_DEPPARSER_D := $(ENGLISH_DEPPARSER_D) -DCOMBINED
281+
ENGLISH_DEPPARSER_IMPL = nivre
282+
endif
283+
284+
#====================================================
285+
286+
$(DIST_DEPPARSER):
287+
$(MKDIR) $(DIST_DEPPARSER)
288+
$(OBJECT_DEPPARSER):
289+
$(MKDIR) $(OBJECT_DEPPARSER)
211290

212-
include Makefile.depparser
213-
include Makefile.deplabeler
291+
SRC_SEGMENTOR = $(SRC_CHINESE)/segmentor
292+
DIST_SEGMENTOR = $(DIST_DIR)/segmentor
293+
OBJECT_SEGMENTOR = $(OBJECT_DIR)/segmentor
294+
$(DIST_SEGMENTOR):
295+
$(MKDIR) $(DIST_SEGMENTOR)
296+
$(OBJECT_SEGMENTOR):
297+
$(MKDIR) $(OBJECT_SEGMENTOR)
298+
299+
include Makefile.zpar.zh
300+
include Makefile.zpar.en
301+
include Makefile.zpar.ge
302+
include Makefile.zpar.es
303+
include Makefile.zpar.mvt
304+
include Makefile.lib.zpar
305+
306+
zpar: zpar.ge
214307

215308
#----------------------------------------------------------------
216309
#
217-
# The conparser
310+
# The sentence boundary detector
218311
#
219312
#----------------------------------------------------------------
220313

221-
include Makefile.conparser
314+
include Makefile.doc2snt
222315

223316
#----------------------------------------------------------------
224317
#

zpar/DepParser.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,16 @@ def __init__(self, modelpath, libptr):
3131
raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))
3232

3333
def dep_parse_sentence(self, sentence, tokenize=True):
34-
zpar_compatible_sentence = sentence.strip() + "\n "
35-
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
36-
parsed_sent = self._dep_parse_sentence(zpar_compatible_sentence, tokenize)
37-
return parsed_sent.decode('utf-8')
34+
if not sentence.strip():
35+
# return empty string if the input is empty
36+
ans = ""
37+
else:
38+
zpar_compatible_sentence = sentence.strip() + "\n "
39+
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
40+
parsed_sent = self._dep_parse_sentence(zpar_compatible_sentence, tokenize)
41+
ans = parsed_sent.decode('utf-8')
42+
43+
return ans
3844

3945
def dep_parse_file(self, inputfile, outputfile, tokenize=True):
4046
if os.path.exists(inputfile):

zpar/Parser.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,15 @@ def __init__(self, modelpath, libptr):
3131
raise OSError('Cannot find parser model at {}\n'.format(modelpath))
3232

3333
def parse_sentence(self, sentence, tokenize=True):
34-
zpar_compatible_sentence = sentence.strip() + "\n "
35-
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
36-
parsed_sent = self._parse_sentence(zpar_compatible_sentence, tokenize)
37-
return parsed_sent.decode('utf-8')
34+
if not sentence.strip():
35+
# return empty string if the input is empty
36+
ans = ""
37+
else:
38+
zpar_compatible_sentence = sentence.strip() + "\n "
39+
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
40+
parsed_sent = self._parse_sentence(zpar_compatible_sentence, tokenize)
41+
ans = parsed_sent.decode('utf-8')
42+
return ans
3843

3944
def parse_file(self, inputfile, outputfile, tokenize=True):
4045
if os.path.exists(inputfile):

zpar/Tagger.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,16 @@ def __init__(self, modelpath, libptr):
3030
raise OSError('Cannot find tagger model at {}\n'.format(modelpath))
3131

3232
def tag_sentence(self, sentence, tokenize=True):
33-
zpar_compatible_sentence = sentence.strip() + "\n "
34-
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
35-
tagged_sent = self._tag_sentence(zpar_compatible_sentence, tokenize)
36-
return tagged_sent.decode('utf-8')
33+
if not sentence.strip():
34+
# return empty string if the input is empty
35+
ans = ""
36+
else:
37+
zpar_compatible_sentence = sentence.strip() + "\n "
38+
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
39+
tagged_sent = self._tag_sentence(zpar_compatible_sentence, tokenize)
40+
ans = tagged_sent.decode('utf-8')
41+
42+
return ans
3743

3844
def tag_file(self, inputfile, outputfile, tokenize=True):
3945
if os.path.exists(inputfile):

0 commit comments

Comments
 (0)