@@ -48,29 +48,15 @@ class BaseParser
4848 REFERENCE = "&(?:#{ NAME } ;|#\\ d+;|#x[0-9a-fA-F]+;)"
4949 REFERENCE_RE = /#{ REFERENCE } /
5050
51- DOCTYPE_START = /\A \s *<!DOCTYPE\s /um
52- DOCTYPE_END = /\A \s *\] \s *>/um
5351 ATTRIBUTE_PATTERN = /\s *(#{ QNAME_STR } )\s *=\s *(["'])(.*?)\4 /um
54- COMMENT_START = /\A <!--/u
55- COMMENT_PATTERN = /<!--(.*?)-->/um
56- CDATA_START = /\A <!\[ CDATA\[ /u
57- CDATA_END = /\A \s *\] \s *>/um
58- CDATA_PATTERN = /<!\[ CDATA\[ (.*?)\] \] >/um
59- XMLDECL_START = /\A <\? xml\s /u ;
60- XMLDECL_PATTERN = /<\? xml\s +(.*?)\? >/um
61- INSTRUCTION_START = /\A <\? /u
62- INSTRUCTION_PATTERN = /<\? #{ NAME } (\s +.*?)?\? >/um
63- TAG_MATCH = /\A <((?>#{ QNAME_STR } ))/um
64- CLOSE_MATCH = /\A \s *<\/ (#{ QNAME_STR } )\s *>/um
52+ INSTRUCTION_PATTERN = /#{ NAME } (\s +.*?)?\? >/um
53+ TAG_MATCH = /((?>#{ QNAME_STR } ))/um
54+ CLOSE_MATCH = /(#{ QNAME_STR } )\s *>/um
6555
6656 VERSION = /\b version\s *=\s *["'](.*?)['"]/um
6757 ENCODING = /\b encoding\s *=\s *["'](.*?)['"]/um
6858 STANDALONE = /\b standalone\s *=\s *["'](.*?)['"]/um
6959
70- ENTITY_START = /\A \s *<!ENTITY/
71- ELEMENTDECL_START = /\A \s *<!ELEMENT/um
72- ELEMENTDECL_PATTERN = /\A \s *(<!ELEMENT.*?)>/um
73- SYSTEMENTITY = /\A \s *(%.*?;)\s *$/um
7460 ENUMERATION = "\\ (\\ s*#{ NMTOKEN } (?:\\ s*\\ |\\ s*#{ NMTOKEN } )*\\ s*\\ )"
7561 NOTATIONTYPE = "NOTATION\\ s+\\ (\\ s*#{ NAME } (?:\\ s*\\ |\\ s*#{ NAME } )*\\ s*\\ )"
7662 ENUMERATEDTYPE = "(?:(?:#{ NOTATIONTYPE } )|(?:#{ ENUMERATION } ))"
@@ -79,10 +65,7 @@ class BaseParser
7965 DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\ s+)?#{ ATTVALUE } ))"
8066 ATTDEF = "\\ s+#{ NAME } \\ s+#{ ATTTYPE } \\ s+#{ DEFAULTDECL } "
8167 ATTDEF_RE = /#{ ATTDEF } /
82- ATTLISTDECL_START = /\A \s *<!ATTLIST/um
83- ATTLISTDECL_PATTERN = /\A \s *<!ATTLIST\s +#{ NAME } (?:#{ ATTDEF } )*\s *>/um
84-
85- TEXT_PATTERN = /\A ([^<]*)/um
68+ ATTLISTDECL_PATTERN = /\s +#{ NAME } (?:#{ ATTDEF } )*\s *>/um
8669
8770 # Entity constants
8871 PUBIDCHAR = "\x20 \x0D \x0A a-zA-Z0-9\\ -()+,./:=?;!*@$_%#"
@@ -94,11 +77,10 @@ class BaseParser
9477 ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{ PEREFERENCE } |#{ REFERENCE } )*")|(?:'([^%&']|#{ PEREFERENCE } |#{ REFERENCE } )*'))}
9578 PEDEF = "(?:#{ ENTITYVALUE } |#{ EXTERNALID } )"
9679 ENTITYDEF = "(?:#{ ENTITYVALUE } |(?:#{ EXTERNALID } (#{ NDATADECL } )?))"
97- PEDECL = "<!ENTITY \\ s+(%)\\ s+#{ NAME } \\ s+#{ PEDEF } \\ s*>"
98- GEDECL = "<!ENTITY \\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
99- ENTITYDECL = /\s * (?:#{ GEDECL } )|\s * (?:#{ PEDECL } )/um
80+ PEDECL = "\\ s+(%)\\ s+#{ NAME } \\ s+#{ PEDEF } \\ s*>"
81+ GEDECL = "\\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
82+ ENTITYDECL = /(?:#{ GEDECL } )|(?:#{ PEDECL } )/um
10083
101- NOTATIONDECL_START = /\A \s *<!NOTATION/um
10284 EXTERNAL_ID_PUBLIC = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s */um
10385 EXTERNAL_ID_SYSTEM = /\A \s *SYSTEM\s +#{ SYSTEMLITERAL } \s */um
10486 PUBLIC_ID = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s */um
@@ -198,65 +180,67 @@ def pull_event
198180 #STDERR.puts @source.encoding
199181 #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
200182 if @document_status == nil
201- word = @source . match ( /\A ((?:\s +)|(?:<[^>]*>))/um )
202- word = word [ 1 ] unless word . nil?
203- #STDERR.puts "WORD = #{word.inspect}"
204- case word
205- when COMMENT_START
206- return [ :comment , @source . match ( COMMENT_PATTERN , true ) [ 1 ] ]
207- when XMLDECL_START
208- #STDERR.puts "XMLDECL"
209- results = @source . match ( XMLDECL_PATTERN , true ) [ 1 ]
210- version = VERSION . match ( results )
211- version = version [ 1 ] unless version . nil?
212- encoding = ENCODING . match ( results )
213- encoding = encoding [ 1 ] unless encoding . nil?
214- if need_source_encoding_update? ( encoding )
215- @source . encoding = encoding
216- end
217- if encoding . nil? and /\A UTF-16(?:BE|LE)\z /i =~ @source . encoding
218- encoding = "UTF-16"
219- end
220- standalone = STANDALONE . match ( results )
221- standalone = standalone [ 1 ] unless standalone . nil?
222- return [ :xmldecl , version , encoding , standalone ]
223- when INSTRUCTION_START
224- return process_instruction
225- when DOCTYPE_START
226- base_error_message = "Malformed DOCTYPE"
227- @source . match ( DOCTYPE_START , true )
228- @nsstack . unshift ( curr_ns = Set . new )
229- name = parse_name ( base_error_message )
230- if @source . match ( /\A \s *\[ /um , true )
231- id = [ nil , nil , nil ]
232- @document_status = :in_doctype
233- elsif @source . match ( /\A \s *>/um , true )
234- id = [ nil , nil , nil ]
235- @document_status = :after_doctype
236- else
237- id = parse_id ( base_error_message ,
238- accept_external_id : true ,
239- accept_public_id : false )
240- if id [ 0 ] == "SYSTEM"
241- # For backward compatibility
242- id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
183+ @source . read
184+ if @source . match ( "<?" , true , false )
185+ if results = @source . match ( /xml\s +(.*?)\? >/um , true , false )
186+ results = results [ 1 ]
187+ version = VERSION . match ( results )
188+ version = version [ 1 ] unless version . nil?
189+ encoding = ENCODING . match ( results )
190+ encoding = encoding [ 1 ] unless encoding . nil?
191+ if need_source_encoding_update? ( encoding )
192+ @source . encoding = encoding
243193 end
244- if @source . match ( /\A \s *\[ /um , true )
194+ if encoding . nil? and /\A UTF-16(?:BE|LE)\z /i =~ @source . encoding
195+ encoding = "UTF-16"
196+ end
197+ standalone = STANDALONE . match ( results )
198+ standalone = standalone [ 1 ] unless standalone . nil?
199+ return [ :xmldecl , version , encoding , standalone ]
200+ else # instruction
201+ return process_instruction
202+ end
203+ elsif @source . match ( "<!" , true , false )
204+ if @source . match ( "--" , true , false )
205+ return [ :comment , @source . match ( /(.*?)-->/um , true ) [ 1 ] ]
206+ elsif @source . match ( /DOCTYPE\s /um , true , false )
207+ base_error_message = "Malformed DOCTYPE"
208+ @nsstack . unshift ( curr_ns = Set . new )
209+ name = parse_name ( base_error_message )
210+ if @source . match ( /\s *\[ /um , true )
211+ id = [ nil , nil , nil ]
245212 @document_status = :in_doctype
246- elsif @source . match ( /\A \s *>/um , true )
213+ elsif @source . match ( /\s *>/um , true )
214+ id = [ nil , nil , nil ]
247215 @document_status = :after_doctype
248216 else
249- message = "#{ base_error_message } : garbage after external ID"
250- raise REXML ::ParseException . new ( message , @source )
217+ id = parse_id ( base_error_message ,
218+ accept_external_id : true ,
219+ accept_public_id : false )
220+ if id [ 0 ] == "SYSTEM"
221+ # For backward compatibility
222+ id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
223+ end
224+ if @source . match ( /\s *\[ /um , true )
225+ @document_status = :in_doctype
226+ elsif @source . match ( /\s *>/um , true )
227+ @document_status = :after_doctype
228+ else
229+ message = "#{ base_error_message } : garbage after external ID"
230+ raise REXML ::ParseException . new ( message , @source )
231+ end
251232 end
233+ args = [ :start_doctype , name , *id ]
234+ if @document_status == :after_doctype
235+ @source . match ( /\s */um , true )
236+ @stack << [ :end_doctype ]
237+ end
238+ return args
239+ else
240+ message = "Invalid XML"
241+ raise REXML ::ParseException . new ( message , @source )
252242 end
253- args = [ :start_doctype , name , *id ]
254- if @document_status == :after_doctype
255- @source . match ( /\A \s */um , true )
256- @stack << [ :end_doctype ]
257- end
258- return args
259- when /\A \s +/
243+ elsif @source . match ( /\s +/ , false , false )
260244 else
261245 @document_status = :after_doctype
262246 if @source . encoding == "UTF-8"
@@ -265,16 +249,13 @@ def pull_event
265249 end
266250 end
267251 if @document_status == :in_doctype
268- md = @source . match ( /\A \s *(.*?>)/um )
269- case md [ 1 ]
270- when SYSTEMENTITY
271- match = @source . match ( SYSTEMENTITY , true ) [ 1 ]
272- return [ :externalentity , match ]
273-
274- when ELEMENTDECL_START
275- return [ :elementdecl , @source . match ( ELEMENTDECL_PATTERN , true ) [ 1 ] ]
276-
277- when ENTITY_START
252+ @source . read
253+ @source . match ( /\s */um , true , false ) # skip spaces
254+ if match = @source . match ( /(%.*?;)\s *$/um , true , false )
255+ return [ :externalentity , match [ 1 ] ]
256+ elsif match = @source . match ( /(<!ELEMENT.*?)>/um , true , false )
257+ return [ :elementdecl , match [ 1 ] ]
258+ elsif @source . match ( "<!ENTITY" , true , false )
278259 match = [ :entitydecl , *@source . match ( ENTITYDECL , true ) . captures . compact ]
279260 ref = false
280261 if match [ 1 ] == '%'
@@ -300,7 +281,7 @@ def pull_event
300281 end
301282 match << '%' if ref
302283 return match
303- when ATTLISTDECL_START
284+ elsif @source . match ( "<!ATTLIST" , true , false )
304285 md = @source . match ( ATTLISTDECL_PATTERN , true )
305286 raise REXML ::ParseException . new ( "Bad ATTLIST declaration!" , @source ) if md . nil?
306287 element = md [ 1 ]
@@ -320,42 +301,41 @@ def pull_event
320301 end
321302 end
322303 return [ :attlistdecl , element , pairs , contents ]
323- when NOTATIONDECL_START
304+ elsif @source . match ( "<!NOTATION" , true , false )
324305 base_error_message = "Malformed notation declaration"
325- unless @source . match ( /\A \s *<!NOTATION \ s +/um , true )
326- if @source . match ( /\A \s *<!NOTATION \ s *>/um )
306+ unless @source . match ( /\s +/um , true )
307+ if @source . match ( /\s *>/um )
327308 message = "#{ base_error_message } : name is missing"
328309 else
329310 message = "#{ base_error_message } : invalid declaration name"
330311 end
312+ @source . string = " <!NOTATION" + @source . buffer
331313 raise REXML ::ParseException . new ( message , @source )
332314 end
333315 name = parse_name ( base_error_message )
334316 id = parse_id ( base_error_message ,
335317 accept_external_id : true ,
336318 accept_public_id : true )
337- unless @source . match ( /\A \ s *>/um , true )
319+ unless @source . match ( /\s *>/um , true )
338320 message = "#{ base_error_message } : garbage before end >"
339321 raise REXML ::ParseException . new ( message , @source )
340322 end
341323 return [ :notationdecl , name , *id ]
342- when DOCTYPE_END
324+ elsif @source . match ( / \] \s *>/um , true , false )
343325 @document_status = :after_doctype
344- @source . match ( DOCTYPE_END , true )
345326 return [ :end_doctype ]
346327 end
347328 end
348329 if @document_status == :after_doctype
349- @source . match ( /\A \ s */um , true )
330+ @source . match ( /\s */um , true )
350331 end
351332 begin
352333 next_data = @source . buffer
353334 if next_data . size < 2
354335 @source . read
355- next_data = @source . buffer
356336 end
357- if next_data [ 0 ] == ?<
358- if next_data [ 1 ] == ?/
337+ if @source . match ( "<" , true , false )
338+ if @source . match ( "/" , true , false )
359339 @nsstack . shift
360340 last_tag = @tags . pop
361341 md = @source . match ( CLOSE_MATCH , true )
@@ -366,15 +346,16 @@ def pull_event
366346 if md . nil? or last_tag != md [ 1 ]
367347 message = "Missing end tag for '#{ last_tag } '"
368348 message << " (got '#{ md [ 1 ] } ')" if md
349+ @source . string = "</" + @source . buffer if md . nil?
369350 raise REXML ::ParseException . new ( message , @source )
370351 end
371352 return [ :end_element , last_tag ]
372- elsif next_data [ 1 ] == ?!
373- md = @source . match ( /\A ( \s * [^>]*>)/um )
353+ elsif @source . match ( "!" , true , false )
354+ md = @source . match ( /( [^>]*>)/um )
374355 #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
375356 raise REXML ::ParseException . new ( "Malformed node" , @source ) unless md
376- if md [ 0 ] [ 2 ] == ?-
377- md = @source . match ( COMMENT_PATTERN , true )
357+ if md [ 0 ] [ 0 ] == ?-
358+ md = @source . match ( /--(.*?)-->/um , true )
378359
379360 case md [ 1 ]
380361 when /--/ , /-\z /
@@ -383,17 +364,18 @@ def pull_event
383364
384365 return [ :comment , md [ 1 ] ] if md
385366 else
386- md = @source . match ( CDATA_PATTERN , true )
367+ md = @source . match ( / \[ CDATA \[ (.*?) \] \] >/um , true )
387368 return [ :cdata , md [ 1 ] ] if md
388369 end
389370 raise REXML ::ParseException . new ( "Declarations can only occur " +
390371 "in the doctype declaration." , @source )
391- elsif next_data [ 1 ] == ??
372+ elsif @source . match ( "?" , true , false )
392373 return process_instruction
393374 else
394375 # Get the next tag
395376 md = @source . match ( TAG_MATCH , true )
396377 unless md
378+ @source . string = "<" + @source . buffer
397379 raise REXML ::ParseException . new ( "malformed XML: missing tag start" , @source )
398380 end
399381 tag = md [ 1 ]
@@ -418,7 +400,7 @@ def pull_event
418400 return [ :start_element , tag , attributes ]
419401 end
420402 else
421- md = @source . match ( TEXT_PATTERN , true )
403+ md = @source . match ( /([^<]*)/um , true )
422404 text = md [ 1 ]
423405 return [ :text , text ]
424406 end
@@ -579,6 +561,7 @@ def process_instruction
579561 match_data = @source . match ( INSTRUCTION_PATTERN , true )
580562 unless match_data
581563 message = "Invalid processing instruction node"
564+ @source . string = "<?" + @source . buffer
582565 raise REXML ::ParseException . new ( message , @source )
583566 end
584567 [ :processing_instruction , match_data [ 1 ] , match_data [ 2 ] ]
0 commit comments