Example using XML source data from a URL

Another example (cansas.py in the source distribution) shows how content can be scraped from a URL that provides XML (using the lxml package) and written as a reST table. This particular XML uses a namespace, which we set up in the variable nsmap:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python

import io
import sys
from lxml import etree
try:
    # python 3
    from urllib.request import urlopen
except ImportError as _exc:
    # python 2
    from urllib2 import urlopen
sys.path.insert(0, '..')
from pyRestTable import Table

# Base URLs of the canSAS 1-D working group repository.
# NOTE(review): SVN_BASE_URL is not referenced by the code shown here;
# only the GitHub base is used to build CANSAS_URL -- confirm before removing.
SVN_BASE_URL = 'http://www.cansas.org/svn/1dwg/trunk'
GITHUB_BASE_URL = 'https://raw.githubusercontent.com/canSAS-org/1dwg/master'
# Full URL of the example XML file that main() downloads and parses.
CANSAS_URL = '/'.join((GITHUB_BASE_URL, 'examples/cs_af1410.xml'))


def main():
    """
    Summarize the SASentry elements of the canSAS example file as a table.

    Downloads the XML document at ``CANSAS_URL``, finds every
    ``cs:SASentry`` element (namespace ``urn:cansas1d:1.1``) and adds one
    row per entry holding the entry's ``name`` attribute, the text of its
    ``cs:Title`` child and the number of its ``cs:SASdata`` children.
    An entry that has no ``cs:Title`` child produces a row of empty strings.

    :returns: a ``pyRestTable.Table`` with the labels
        ``SASentry``, ``description`` and ``measurements``
    """
    nsmap = dict(cs='urn:cansas1d:1.1')

    response = urlopen(CANSAS_URL)
    try:
        r = response.read().decode("utf-8")
    finally:
        # try/finally (rather than "with") so the connection is released
        # under both the urllib2 and urllib.request imports used by this file
        response.close()
    doc = etree.parse(io.StringIO(r))

    node_list = doc.xpath('//cs:SASentry', namespaces=nsmap)
    t = Table()
    t.labels = ['SASentry', 'description', 'measurements']
    for node in node_list:
        # reset every column for each entry so that an entry without a
        # Title can neither reuse the previous entry's text nor hit an
        # unbound variable
        s_name, title, count = '', '', ''
        subnode = node.find('cs:Title', namespaces=nsmap)
        if subnode is not None:
            # tostring() returns bytes here, hence the decode()
            s = etree.tostring(subnode, method="text")
            title = s.strip().decode()
            # fall back to an empty name when the attribute is absent
            s_name = node.attrib.get('name', '')
            count = len(node.xpath('cs:SASdata', namespaces=nsmap))
        t.rows += [[s_name, title, count]]

    return t


if __name__ == '__main__':
    # Render with the "complex" format because the s_name column
    # might contain an empty string.
    rendered = main().reST(fmt='complex')
    print(rendered)

The output from this code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
10 SASentry elements in http://www.cansas.org/svn/1dwg/trunk/examples/cs_af1410.xml

+-----------+--------------------------------------+--------------+
| entry     | description                          | measurements |
+===========+======================================+==============+
| AF1410:10 | AF1410-10 (AF1410 steel aged 10 h)   | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:8h | AF1410-8h (AF1410 steel aged 8 h)    | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:qu | AF1410-qu (AF1410 steel aged 0.25 h) | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:cc | AF1410-cc (AF1410 steel aged 100 h)  | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:2h | AF1410-2h (AF1410 steel aged 2 h)    | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:50 | AF1410-50 (AF1410 steel aged 50 h)   | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:20 | AF1410-20 (AF1410 steel aged 20 h)   | 1            |
+-----------+--------------------------------------+--------------+
| AF1410:5h | AF1410-5h (AF1410 steel aged 5 h)    | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:1h | AF1410-1h (AF1410 steel aged 1 h)    | 2            |
+-----------+--------------------------------------+--------------+
| AF1410:hf | AF1410-hf (AF1410 steel aged 0.5 h)  | 2            |
+-----------+--------------------------------------+--------------+

The resulting table is shown:

10 SASentry elements in http://www.cansas.org/svn/1dwg/trunk/examples/cs_af1410.xml

entry description measurements
AF1410:10 AF1410-10 (AF1410 steel aged 10 h) 2
AF1410:8h AF1410-8h (AF1410 steel aged 8 h) 2
AF1410:qu AF1410-qu (AF1410 steel aged 0.25 h) 2
AF1410:cc AF1410-cc (AF1410 steel aged 100 h) 2
AF1410:2h AF1410-2h (AF1410 steel aged 2 h) 2
AF1410:50 AF1410-50 (AF1410 steel aged 50 h) 2
AF1410:20 AF1410-20 (AF1410 steel aged 20 h) 1
AF1410:5h AF1410-5h (AF1410 steel aged 5 h) 2
AF1410:1h AF1410-1h (AF1410 steel aged 1 h) 2
AF1410:hf AF1410-hf (AF1410 steel aged 0.5 h) 2