Class: Oga::XML::Parser

Inherits:
LL::Driver
  • Object
show all
Defined in:
lib/oga/xml/parser.rb

Overview

DOM parser for both XML and HTML.

This parser does not produce a dedicated AST; instead it emits XML nodes directly. Basic usage of this parser is as follows:

parser   = Oga::XML::Parser.new('<foo></foo>')
document = parser.parse

To enable HTML parsing you’d use the following instead:

parser   = Oga::XML::Parser.new('<foo></foo>', :html => true)
document = parser.parse

In both cases you can use either a String or an IO as the parser input. IO instances will result in lower memory overhead, especially when parsing large files.

Direct Known Subclasses

HTML::Parser, PullParser, SaxParser

Constant Summary collapse

# Holds an LL::DriverConfig instance for this parser class; how it is
# populated is not visible in this listing.
CONFIG =
LL::DriverConfig.new
TOKEN_ERROR_MAPPING =

Hash mapping token types and dedicated error labels.

Returns:

  • (Hash)
{
  # Keys are token types; values are the human-readable labels that
  # parser_error substitutes into its "Unexpected ..., expected ..." messages.
  # Token types without an entry are printed as-is by parser_error.
  :T_STRING         => 'string',
  :T_TEXT           => 'text',
  :T_DOCTYPE_START  => 'doctype start',
  :T_DOCTYPE_END    => 'doctype closing tag',
  :T_DOCTYPE_TYPE   => 'doctype type',
  :T_DOCTYPE_NAME   => 'doctype name',
  :T_DOCTYPE_INLINE => 'inline doctype rules',
  :T_CDATA          => 'CDATA',
  :T_COMMENT        => 'comment',
  :T_ELEM_START     => 'element start',
  :T_ELEM_NAME      => 'element name',
  :T_ELEM_NS        => 'element namespace',
  :T_ELEM_END       => 'element closing tag',
  :T_ATTR           => 'attribute',
  :T_ATTR_NS        => 'attribute namespace',
  :T_XML_DECL_START => 'XML declaration start',
  :T_XML_DECL_END   => 'XML declaration end',
  :T_PROC_INS_START => 'processing-instruction start',
  :T_PROC_INS_NAME  => 'processing-instruction name',
  :T_PROC_INS_END   => 'processing-instruction closing tag',
  # -1 is the token type each_token yields once the lexer has no more tokens.
  -1                => 'end of input'
}

Instance Method Summary collapse

Constructor Details

#initialize(data, options = {}) ⇒ Parser

Returns a new instance of Parser

Parameters:

  • data (String|IO)

    The input to parse.

  • options (Hash) (defaults to: {})

See Also:

  • Oga::XML::Lexer#initialize


212
213
214
215
216
217
# File 'lib/oga/xml/parser.rb', line 212

# Builds a parser around a fresh lexer for the given input.
#
# @param [String|IO] data    The input to parse.
# @param [Hash]      options Forwarded unchanged to Lexer.new.
def initialize(data, options = {})
  # Line counter reported in parser errors; refreshed by each_token.
  @line  = 1
  @data  = data
  @lexer = Lexer.new(data, options)

  # NOTE(review): reset_native is defined on the lexer, not in this listing.
  @lexer.reset_native
end

Instance Method Details

#_rule_0(val) ⇒ Object



362
363
364
# File 'lib/oga/xml/parser.rb', line 362

# Rule 0 action: hands the first matched value to on_document and returns
# whatever that callback produces.
#
# @param [Array] val values matched for this rule
def _rule_0(val)
  children = val.first
  on_document(children)
end

#_rule_1(val) ⇒ Object



366
367
368
# File 'lib/oga/xml/parser.rb', line 366

# Rule 1 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_1(val)
  val.first
end

#_rule_10(val) ⇒ Object



410
411
412
# File 'lib/oga/xml/parser.rb', line 410

# Rule 10 action: folds the collection held in the first matched value
# into a single value using +. Yields nil when that collection is empty.
#
# @param [Array] val values matched for this rule; val[0] is enumerable
def _rule_10(val)
  val.first.reduce { |joined, piece| joined + piece }
end

#_rule_11(val) ⇒ Object



414
415
416
# File 'lib/oga/xml/parser.rb', line 414

# Rule 11 action: builds a CDATA node from the second matched value via
# on_cdata.
#
# @param [Array] val values matched for this rule
def _rule_11(val)
  _, body = val
  on_cdata(body)
end

#_rule_12(val) ⇒ Object



418
419
420
# File 'lib/oga/xml/parser.rb', line 418

# Rule 12 action: joins the first two matched values with +.
#
# @param [Array] val values matched for this rule
def _rule_12(val)
  head, tail = val
  head + tail
end

#_rule_13(val) ⇒ Object



422
423
424
# File 'lib/oga/xml/parser.rb', line 422

# Rule 13 action: ignores the matched values and returns an empty String.
#
# @param [Array] val values matched for this rule (unused)
def _rule_13(val)
   '' 
end

#_rule_14(val) ⇒ Object



426
427
428
# File 'lib/oga/xml/parser.rb', line 426

# Rule 14 action: builds a comment node from the second matched value via
# on_comment.
#
# @param [Array] val values matched for this rule
def _rule_14(val)
  _, body = val
  on_comment(body)
end

#_rule_15(val) ⇒ Object



430
431
432
# File 'lib/oga/xml/parser.rb', line 430

# Rule 15 action: joins the first two matched values with +.
#
# @param [Array] val values matched for this rule
def _rule_15(val)
  head, tail = val
  head + tail
end

#_rule_16(val) ⇒ Object



434
435
436
# File 'lib/oga/xml/parser.rb', line 434

# Rule 16 action: ignores the matched values and returns an empty String.
#
# @param [Array] val values matched for this rule (unused)
def _rule_16(val)
   '' 
end

#_rule_17(val) ⇒ Object



438
439
440
441
442
# File 'lib/oga/xml/parser.rb', line 438

# Rule 17 action: builds a processing instruction through on_proc_ins,
# using the second matched value as its name and the third as its text.
#
# @param [Array] val values matched for this rule
def _rule_17(val)
  _, name, text = val
  on_proc_ins(name, text)
end

#_rule_18(val) ⇒ Object



444
445
446
# File 'lib/oga/xml/parser.rb', line 444

# Rule 18 action: joins the first two matched values with +.
#
# @param [Array] val values matched for this rule
def _rule_18(val)
  head, tail = val
  head + tail
end

#_rule_19(val) ⇒ Object



448
449
450
# File 'lib/oga/xml/parser.rb', line 448

# Rule 19 action: ignores the matched values and returns an empty String.
#
# @param [Array] val values matched for this rule (unused)
def _rule_19(val)
   '' 
end

#_rule_2(val) ⇒ Object



370
371
372
# File 'lib/oga/xml/parser.rb', line 370

# Rule 2 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_2(val)
  val.first
end

#_rule_20(val) ⇒ Object



452
453
454
# File 'lib/oga/xml/parser.rb', line 452

# Rule 20 action: returns a two-element Array whose first slot is nil and
# whose second slot is the first matched value.
# NOTE(review): presumably a [namespace, name] pair as consumed by
# _rule_22 -- confirm against the grammar.
#
# @param [Array] val values matched for this rule
def _rule_20(val)
  [nil, val.first]
end

#_rule_21(val) ⇒ Object



456
457
458
# File 'lib/oga/xml/parser.rb', line 456

# Rule 21 action: returns the matched values Array itself, untouched.
# NOTE(review): presumably a [namespace, name] pair mirroring the
# [nil, name] shape returned by _rule_20 -- confirm against the grammar.
#
# @param [Array] val values matched for this rule
def _rule_21(val)
   val 
end

#_rule_22(val) ⇒ Object



460
461
462
463
464
# File 'lib/oga/xml/parser.rb', line 460

# Rule 22 action: creates an element through on_element. The first matched
# value supplies the namespace (index 0) and name (index 1); the second
# matched value supplies the attributes.
#
# @param [Array] val values matched for this rule
def _rule_22(val)
  ident, attributes = val
  on_element(ident[0], ident[1], attributes)
end

#_rule_23(val) ⇒ Object



466
467
468
469
470
471
472
473
474
# File 'lib/oga/xml/parser.rb', line 466

# Rule 23 action: attaches the children (second matched value) to the
# element (first matched value) when an element is present, then always
# runs the after_element hook and returns its result.
#
# @param [Array] val values matched for this rule
def _rule_23(val)
  element, children = val

  on_element_children(element, children) if element

  after_element(element)
end

#_rule_24(val) ⇒ Object



476
477
478
# File 'lib/oga/xml/parser.rb', line 476

# Rule 24 action: forwards the first matched value to on_attributes.
#
# @param [Array] val values matched for this rule
def _rule_24(val)
  attrs = val.first
  on_attributes(attrs)
end

#_rule_25(val) ⇒ Object



480
481
482
# File 'lib/oga/xml/parser.rb', line 480

# Rule 25 action: creates a namespaced attribute through on_attribute.
# The matched values arrive as namespace name, attribute name, value;
# on_attribute takes them as (name, ns_name, value).
#
# @param [Array] val values matched for this rule
def _rule_25(val)
  ns_name, name, value = val
  on_attribute(name, ns_name, value)
end

#_rule_26(val) ⇒ Object



484
485
486
# File 'lib/oga/xml/parser.rb', line 484

# Rule 26 action: creates an attribute without a namespace through
# on_attribute, from the first (name) and second (value) matched values.
#
# @param [Array] val values matched for this rule
def _rule_26(val)
  name, value = val
  on_attribute(name, nil, value)
end

#_rule_27(val) ⇒ Object



488
489
490
# File 'lib/oga/xml/parser.rb', line 488

# Rule 27 action: builds an XML declaration from the second matched value
# via on_xml_decl.
#
# @param [Array] val values matched for this rule
def _rule_27(val)
  _, attributes = val
  on_xml_decl(attributes)
end

#_rule_28(val) ⇒ Object



492
493
494
495
496
497
498
# File 'lib/oga/xml/parser.rb', line 492

# Rule 28 action: builds a text node through on_text. The text is the
# first matched value, with the second appended via + when it is truthy.
#
# @param [Array] val values matched for this rule
def _rule_28(val)
  leading, remainder = val

  text =
    if remainder
      leading + remainder
    else
      leading
    end

  on_text(text)
end

#_rule_29(val) ⇒ Object



500
501
502
# File 'lib/oga/xml/parser.rb', line 500

# Rule 29 action: returns the first matched value, with the second
# appended via + when the second is truthy.
#
# @param [Array] val values matched for this rule
def _rule_29(val)
  leading, remainder = val
  return leading unless remainder

  leading + remainder
end

#_rule_3(val) ⇒ Object



374
375
376
# File 'lib/oga/xml/parser.rb', line 374

# Rule 3 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_3(val)
  val.first
end

#_rule_30(val) ⇒ Object



504
505
506
# File 'lib/oga/xml/parser.rb', line 504

# Rule 30 action: ignores the matched values and returns nil.
#
# @param [Array] val values matched for this rule (unused)
def _rule_30(val)
   nil 
end

#_rule_31(val) ⇒ Object



508
509
510
# File 'lib/oga/xml/parser.rb', line 508

# Rule 31 action: returns the second matched value, discarding the rest.
#
# @param [Array] val values matched for this rule
def _rule_31(val)
  _, inner = val
  inner
end

#_rule_32(val) ⇒ Object



512
513
514
# File 'lib/oga/xml/parser.rb', line 512

# Rule 32 action: returns the second matched value, discarding the rest.
#
# @param [Array] val values matched for this rule
def _rule_32(val)
  _, inner = val
  inner
end

#_rule_33(val) ⇒ Object



516
517
518
# File 'lib/oga/xml/parser.rb', line 516

# Rule 33 action: joins the first two matched values with +.
#
# @param [Array] val values matched for this rule
def _rule_33(val)
  head, tail = val
  head + tail
end

#_rule_34(val) ⇒ Object



520
521
522
# File 'lib/oga/xml/parser.rb', line 520

# Rule 34 action: ignores the matched values and returns an empty String.
#
# @param [Array] val values matched for this rule (unused)
def _rule_34(val)
   '' 
end

#_rule_35(val) ⇒ Object



524
525
526
# File 'lib/oga/xml/parser.rb', line 524

# Rule 35 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_35(val)
  val.first
end

#_rule_36(val) ⇒ Object



528
529
530
# File 'lib/oga/xml/parser.rb', line 528

# Rule 36 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_36(val)
  val.first
end

#_rule_37(val) ⇒ Object



532
533
534
# File 'lib/oga/xml/parser.rb', line 532

# Rule 37 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_37(val)
  val.first
end

#_rule_38(val) ⇒ Object



536
537
538
# File 'lib/oga/xml/parser.rb', line 536

# Rule 38 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_38(val)
  val.first
end

#_rule_39(val) ⇒ Object



540
541
542
# File 'lib/oga/xml/parser.rb', line 540

# Rule 39 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_39(val)
  val.first
end

#_rule_4(val) ⇒ Object



378
379
380
# File 'lib/oga/xml/parser.rb', line 378

# Rule 4 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_4(val)
  val.first
end

#_rule_40(val) ⇒ Object



544
545
546
# File 'lib/oga/xml/parser.rb', line 544

# Rule 40 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_40(val)
  val.first
end

#_rule_41(val) ⇒ Object



548
549
550
# File 'lib/oga/xml/parser.rb', line 548

# Rule 41 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_41(val)
  val.first
end

#_rule_42(val) ⇒ Object



552
553
554
# File 'lib/oga/xml/parser.rb', line 552

# Rule 42 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_42(val)
  val.first
end

#_rule_5(val) ⇒ Object



382
383
384
# File 'lib/oga/xml/parser.rb', line 382

# Rule 5 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_5(val)
  val.first
end

#_rule_6(val) ⇒ Object



386
387
388
# File 'lib/oga/xml/parser.rb', line 386

# Rule 6 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_6(val)
  val.first
end

#_rule_7(val) ⇒ Object



390
391
392
# File 'lib/oga/xml/parser.rb', line 390

# Rule 7 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_7(val)
  val.first
end

#_rule_8(val) ⇒ Object



394
395
396
# File 'lib/oga/xml/parser.rb', line 394

# Rule 8 action: passes the first matched value through unchanged.
#
# @param [Array] val values matched for this rule
def _rule_8(val)
  val.first
end

#_rule_9(val) ⇒ Object



398
399
400
401
402
403
404
405
406
407
408
# File 'lib/oga/xml/parser.rb', line 398

# Rule 9 action: builds a doctype through on_doctype. The first matched
# value is skipped; the next five become the name, type, public ID,
# system ID and inline rules, in that order.
#
# @param [Array] val values matched for this rule
def _rule_9(val)
  _, name, type, public_id, system_id, inline_rules = val

  on_doctype(
    :name         => name,
    :type         => type,
    :public_id    => public_id,
    :system_id    => system_id,
    :inline_rules => inline_rules
  )
end

#after_element(element) ⇒ Oga::XML::Element

Parameters:

Returns:



341
342
343
# File 'lib/oga/xml/parser.rb', line 341

# Hook called by _rule_23 once an element has been handled (after its
# children, if any, were assigned). This implementation returns the
# element untouched.
#
# @param [Oga::XML::Element] element
# @return [Oga::XML::Element]
def after_element(element)
  element
end

#each_token {|| ... } ⇒ Object

Yields the next token from the lexer.

Yield Parameters:

  • (Array)


222
223
224
225
226
227
228
229
230
# File 'lib/oga/xml/parser.rb', line 222

# Yields every token produced by the lexer as a [type, value] pair, then
# one final [-1, -1] pair once the lexer is done. Whenever the lexer
# reports a line number it is remembered in @line, which parser_error
# includes in its message.
#
# @yieldparam [Array] token a [type, value] pair
def each_token
  @lexer.advance do |type, value, line|
    # Keep the previous line number when the lexer does not supply one.
    if line
      @line = line
    end

    token = [type, value]
    yield token
  end

  # End-of-input marker.
  yield [-1, -1]
end

#on_attribute(name, ns_name = nil, value = nil) ⇒ Oga::XML::Attribute

Parameters:

  • name (String)
  • ns_name (String) (defaults to: nil)
  • value (String) (defaults to: nil)

Returns:



349
350
351
352
353
354
355
# File 'lib/oga/xml/parser.rb', line 349

# Creates an attribute node from a name, an optional namespace name and
# an optional value.
#
# @param [String] name
# @param [String] ns_name
# @param [String] value
# @return [Oga::XML::Attribute]
def on_attribute(name, ns_name = nil, value = nil)
  attribute_options = {
    :namespace_name => ns_name,
    :name           => name,
    :value          => value
  }

  Attribute.new(attribute_options)
end

#on_attributes(attrs) ⇒ Object

Parameters:

  • attrs (Array)


358
359
360
# File 'lib/oga/xml/parser.rb', line 358

# Callback invoked by _rule_24 with the parsed attributes. This
# implementation returns the given value untouched.
#
# @param [Array] attrs
def on_attributes(attrs)
  attrs
end

#on_cdata(text = nil) ⇒ Oga::XML::Cdata

Parameters:

  • text (String) (defaults to: nil)

Returns:



281
282
283
# File 'lib/oga/xml/parser.rb', line 281

# Creates a CDATA node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Cdata]
def on_cdata(text = nil)
  node_options = { :text => text }
  Cdata.new(node_options)
end

#on_comment(text = nil) ⇒ Oga::XML::Comment

Parameters:

  • text (String) (defaults to: nil)

Returns:



287
288
289
# File 'lib/oga/xml/parser.rb', line 287

# Creates a comment node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Comment]
def on_comment(text = nil)
  node_options = { :text => text }
  Comment.new(node_options)
end

#on_doctype(options = {}) ⇒ Object

Parameters:

  • options (Hash) (defaults to: {})


275
276
277
# File 'lib/oga/xml/parser.rb', line 275

# Creates a doctype node, passing the options Hash straight to
# Doctype.new. _rule_9 calls this with the keys :name, :type, :public_id,
# :system_id and :inline_rules.
#
# @param [Hash] options
def on_doctype(options = {})
  Doctype.new(options)
end

#on_document(children = []) ⇒ Oga::XML::Document

Parameters:

  • children (Array) (defaults to: [])

Returns:



256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# File 'lib/oga/xml/parser.rb', line 256

# Creates the document and distributes the given nodes over it. A Doctype
# becomes the document's doctype, an XmlDeclaration becomes its XML
# declaration, and every other node is appended to the document's
# children in the order given. The document type is :html when the lexer
# reports HTML mode and :xml otherwise.
#
# @param [Array] children
# @return [Oga::XML::Document]
def on_document(children = [])
  doc_type = @lexer.html? ? :html : :xml
  document = Document.new(:type => doc_type)

  children.each do |node|
    case node
    when Doctype
      document.doctype = node
    when XmlDeclaration
      document.xml_declaration = node
    else
      document.children << node
    end
  end

  document
end

#on_element(namespace, name, attributes = {}) ⇒ Oga::XML::Element

Parameters:

  • namespace (String)
  • name (String)
  • attributes (Hash) (defaults to: {})

Returns:



320
321
322
323
324
325
326
327
328
# File 'lib/oga/xml/parser.rb', line 320

# Creates an element node with the given namespace name, name and
# attributes.
#
# @param [String] namespace
# @param [String] name
# @param [Hash] attributes
# @return [Oga::XML::Element]
def on_element(namespace, name, attributes = {})
  Element.new(
    :namespace_name => namespace,
    :name           => name,
    :attributes     => attributes
  )
end

#on_element_children(element, children = []) ⇒ Oga::XML::Element

Parameters:

Returns:



333
334
335
336
337
# File 'lib/oga/xml/parser.rb', line 333

# Assigns the given children to the element and returns that element.
#
# @param [Oga::XML::Element] element
# @param [Array] children
# @return [Oga::XML::Element]
def on_element_children(element, children = [])
  element.tap { |node| node.children = children }
end

#on_proc_ins(name, text = nil) ⇒ Oga::XML::ProcessingInstruction

Parameters:

  • name (String)
  • text (String) (defaults to: nil)

Returns:



294
295
296
# File 'lib/oga/xml/parser.rb', line 294

# Creates a processing instruction node with the given name and text.
#
# @param [String] name
# @param [String] text
# @return [Oga::XML::ProcessingInstruction]
def on_proc_ins(name, text = nil)
  node_options = { :name => name, :text => text }
  ProcessingInstruction.new(node_options)
end

#on_text(text) ⇒ Oga::XML::Text

Parameters:

  • text (String)

Returns:



312
313
314
# File 'lib/oga/xml/parser.rb', line 312

# Creates a text node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Text]
def on_text(text)
  node_options = { :text => text }
  Text.new(node_options)
end

#on_xml_decl(attributes = []) ⇒ Oga::XML::XmlDeclaration

Parameters:

  • attributes (Array) (defaults to: [])

Returns:



300
301
302
303
304
305
306
307
308
# File 'lib/oga/xml/parser.rb', line 300

# Creates an XML declaration node. Each attribute contributes one option:
# its name (converted to a Symbol) maps to its value. A later attribute
# with the same name overwrites an earlier one.
#
# @param [Array] attributes objects responding to #name and #value
# @return [Oga::XML::XmlDeclaration]
def on_xml_decl(attributes = [])
  options = attributes.each_with_object({}) do |attribute, collected|
    collected[attribute.name.to_sym] = attribute.value
  end

  XmlDeclaration.new(options)
end

#parser_error(stack_type, stack_value, token_type, token_value) ⇒ Object

Parameters:

  • stack_type (Fixnum)
  • stack_value (Fixnum)
  • token_type (Symbol)
  • token_value (String)

Raises:

  • (LL::ParserError)


236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/oga/xml/parser.rb', line 236

# Raises an LL::ParserError describing why parsing stopped. The message
# depends on what kind of entry was on the parser stack and always ends
# with the last line number recorded by each_token.
#
# Token types that have an entry in TOKEN_ERROR_MAPPING are shown using
# their human-readable label in the :terminal and fallback messages.
#
# @param [Fixnum] stack_type  ID of the stack entry kind, resolved through
#   id_to_type.
# @param [Fixnum] stack_value Rule or terminal ID found on the stack.
# @param [Symbol] token_type  Type of the token that triggered the error.
# @param [String] token_value Value of that token (not used in the message).
# @raise [LL::ParserError]
def parser_error(stack_type, stack_value, token_type, token_value)
  message =
    case id_to_type(stack_type)
    when :rule
      "Unexpected #{token_type} for rule #{stack_value}"
    when :terminal
      expected = id_to_terminal(stack_value)
      expected = TOKEN_ERROR_MAPPING[expected]   || expected
      got      = TOKEN_ERROR_MAPPING[token_type] || token_type

      "Unexpected #{got}, expected #{expected} instead"
    when :eof
      'Unexpected end of input'
    else
      # Any other stack entry kind used to leave the message nil, which
      # turned the parse error into a NoMethodError. Fall back to a generic
      # message so callers still receive an LL::ParserError.
      got = TOKEN_ERROR_MAPPING[token_type] || token_type

      "Unexpected #{got}"
    end

  raise LL::ParserError, "#{message} on line #{@line}"
end