Source code for pandalone.xleash._parse

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2014 European Commission (JRC);
# Licensed under the EUPL (the 'Licence');
# You may not use this work except in compliance with the Licence.
# You may obtain a copy of the Licence at: http://ec.europa.eu/idabc/eupl
"""
The syntax-parsing part *xleash*.

Prefer accessing the public members from the parent module.

.. currentmodule:: pandalone.xleash
"""

from __future__ import unicode_literals

from collections import namedtuple
import json
import re

from future.builtins import str
from urllib.parse import urldefrag
from past.builtins import basestring

import itertools as itt
import numpy as np


[docs]class Cell(namedtuple('Cell', ('row', 'col', 'brow', 'bcol'))):
    """
    A pair of 1-based strings, denoting the "A1" coordinates of a cell.

    The "num" coords (numeric, 0-based) are specified using numpy-arrays
    (:class:`Coords`).
    """

[docs]    def __new__(cls, row, col, brow=None, bcol=None):

        return super(cls, Cell).__new__(cls,
                                        row and row.upper(),
                                        col and col.upper(),
                                        brow and brow.upper(),
                                        bcol and bcol.upper())

[docs]    def __str__(self):
        r = self.row
        c = self.col
        try:
            c = int(c)
            s = 'R%sC%s' % (r, c, )
        except:
            s = '%s%s' % (c, r)
        return s

[docs]    def __repr__(self):
        if self.brow or self.bcol:
            s = super(Cell, self).__repr__()
        else:
            s = "Cell(row=%r, col=%r)" % (self.row, self.col)
        return s

Cell.__new__.__defaults__ = (None, None)
"""Make :class:`Cell` construct with missing 'brow', 'bcol' fields as `None`."""


[docs]class Edge(namedtuple('Edge', ('land', 'mov', 'mod'))):
    """
    All the infos required to :term:`target` a cell.

    An :term:`Edge` contains *A1* :class:`Cell` as `land`.

    :param Cell land: the :term:`landing-cell`
    :param str mov: use None for missing moves.
    :param str mod: one of (`+`, `-` or `None`)
    """
    __slots__ = ()

[docs]    def __new__(cls, land, mov=None, mod=None):
        return super(cls, Edge).__new__(cls, land, mov, mod)

[docs]    def __str__(self):
        return ('%s(%s%s)' % (self.land, self.mov, self.mod or '')
                if self.mov
                else str(self.land))

    # def __repr__(self):
    #     return "Edge('%s')" % str(self)

_topleft_Edge = Edge(Cell('^', '^'))
_bottomright_Edge = Edge(Cell('_', '_'))


[docs]def Edge_new(row, col, mov=None, mod=None, default=None):
    """
    Make a new `Edge` from any non-values supplied, as is capitalized, or nothing.

    :param str, None col:    ie ``A``
    :param str, None row:    ie ``1``
    :param str, None mov:    ie ``RU``
    :param str, None mod:    ie ``+``

    :return:    a `Edge` if any non-None
    :rtype:     Edge, None


    Examples::

        >>> Edge_new('1', 'a', 'Rul', '-')
        Edge(land=Cell(row='1', col='A'), mov='RUL', mod='-')
        >>> print(Edge_new('5', '5'))
        R5C5


    No error checking performed::

        >>> Edge_new('Any', 'foo', 'BaR', '+_&%')
        Edge(land=Cell(row='ANY', col='FOO'), mov='BAR', mod='+_&%')

        >>> print(Edge_new(None, None, None, None))
        None


    except were coincidental::

        >>> Edge_new(row=0, col=123, mov='BAR', mod=None)
        Traceback (most recent call last):
        AttributeError: 'int' object has no attribute 'upper'

        >>> Edge_new(row=0, col='A', mov=123, mod=None)
        Traceback (most recent call last):
        AttributeError: 'int' object has no attribute 'upper'
    """

    if col == row == mov == mod is None:
        return default

    return Edge(land=Cell(row, col),
                mov=mov and mov.upper(), mod=mod)

_encase_regex = re.compile(r'^\s*(?P<q>[/\\"$%&])(.+)(?P=q)\s*$', re.DOTALL)
_regular_xlref_regex = re.compile(
    r"""
    ^\s*(?:(?P<sh_name>[^!]+)?!)?                            # xl sheet name
    (?:                                                      # 1st-edge
        (?:
            (?:
                (?P<st_col>[A-Z]+|[_^])                      # A1-col
                (?P<st_row>[123456789]\d*|[_^])              # A1-row
            ) | (?:
                R(?P<st_row2>-?[123456789]\d*|[_^.])         # RC-row
                C(?P<st_col2>-?[123456789]\d*|[_^.])         # RC-col
            )
        )
        (?:\(
            (?P<st_mov>L|U|R|D|LD|LU|UL|UR|RU|RD|DL|DR)      # moves
            (?P<st_mod>[+?])?                                # move modifiers
            \)
        )?
    )?
    (?:(?P<colon>:)                                          # ':' needed if 2nd
        (?:                                                  # 2nd-edge
            (?:                                              # target
                (?:
                    (?P<nd_col>[A-Z]+|[_^.])                 # A1-col
                    (?P<nd_row>[123456789]\d*|[_^.])         # A1-row
                ) | (?:
                    R(?P<nd_row2>-?[123456789]\d*|[_^.])     # RC-row
                    C(?P<nd_col2>-?[123456789]\d*|[_^.])     # RC-col
                )
            )
            (?:\(
                (?P<nd_mov>L|U|R|D|LD|LU|UL|UR|RU|RD|DL|DR)  # moves
                (?P<nd_mod>[+?])?                            # move-modifiers
                \)
            )?
        )?
        (?:
            :(?P<exp_moves>[LURD?123456789]+)                # expansion moves
        )?
    )?
    (?:
        :\s*(?P<js_filt>[{"[].*)                             # filters
    )?$
    """,
    re.IGNORECASE | re.X | re.DOTALL)
"""The regex for parsing regular :term:`xl-ref`. """

_re_exp_moves_splitter = re.compile('([LURD]\d+)', re.IGNORECASE)

# TODO: Make exp_moves `?` work different from numbers.
_re_exp_moves_parser = re.compile(
    r"""
    ^(?P<moves>[LURD]+)                                  # primitive moves
    (?P<times>\?|\d+)?                                   # repetition times
    $""",
    re.IGNORECASE | re.X)


_excel_str_translator = str.maketrans('“”', '""')  # @UndefinedVariable
"""Excel use these !@#% chars for double-quotes, which are not valid JSON-strings!!"""

CallSpec = namedtuple('CallSpec', ['func', 'args', 'kwds'])
"""The :term:`call-specifier` for holding the parsed json-filters."""

CallSpec.__new__.__defaults__ = ([], {})
"""Make optional the last 2 fields of :class:`CallSpec` ``(args', kwds)`` ."""


[docs]def parse_call_spec(call_spec_values):
    """
    Parse :term:`call-specifier` from json-filters.

    :param call_spec_values:
        This is a *non-null* structure specifying some function call
        in the `filter` part, which it can be either:

        - string: ``"func_name"``
        - list:   ``["func_name", ["arg1", "arg2"], {"k1": "v1"}]``
          where the last 2 parts are optional and can be given in any order;
        - object: ``{"func": "func_name", "args": ["arg1"], "kwds": {"k":"v"}}``
          where the `args` and `kwds` are optional.

    :return:
        the 3-tuple ``func, args=(), kwds={}`` with the defaults as shown
        when missing.
    """
    def boolarr(l):
        return np.fromiter(l, dtype=bool)

    def parse_list(func, item1=None, item2=None):
        items = (item1, item2)
        isargs = boolarr(isinstance(a, list) for a in items)
        iskwds = boolarr(isinstance(a, dict) for a in items)
        isnull = boolarr(a is None for a in items)

        if isargs.all() or iskwds.all() or not (
                isargs ^ iskwds ^ isnull).all():
            msg = "Cannot decide `args`/`kwds` for call_spec(%s)!"
            raise ValueError(msg.format(call_spec_values))
        args, kwds = None, None
        if isargs.any():
            args = items[isargs.nonzero()[0][0]]
        if iskwds.any():
            kwds = items[iskwds.nonzero()[0][0]]
        return func, args, kwds

    def parse_object(func, args=None, kwds=None):
        return func, args, kwds

    try:
        if isinstance(call_spec_values, basestring):
            func, args, kwds = call_spec_values, None, None
        # elif isinstance(call_spec_values, (tuple, list)): ???
        elif isinstance(call_spec_values, list):
            func, args, kwds = parse_list(*call_spec_values)
        elif isinstance(call_spec_values, dict):
            func, args, kwds = parse_object(**call_spec_values)
        else:
            msg = "One of str, list or dict expected for call-spec({})!"
            raise ValueError(msg.format(call_spec_values))
    except ValueError:
        raise
    except Exception as ex:
        msg = "Cannot parse call-spec({}) due to: {}"
        raise ValueError(msg.format(call_spec_values, ex))

    if not isinstance(func, basestring):
        msg = "Expected a `string` for func({}) for call-spec({})!"
        raise ValueError(msg.format(func, call_spec_values))
    if args is None:
        args = []
    elif not isinstance(args, list):
        msg = "Expected a `list` for args({}) for call-spec({})!"
        raise ValueError(msg.format(args, call_spec_values))
    if kwds is None:
        kwds = {}
    elif not isinstance(kwds, dict):
        msg = "Expected a `dict` for kwds({}) for call-spec({})!"
        raise ValueError(msg.format(kwds, call_spec_values))

    return CallSpec(func, args, kwds)


[docs]def _repeat_moves(moves, times=None):
    """
    Returns an iterator that repeats `moves` x `times`, or infinite if unspecified.

    Used when parsing primitive :term:`directions`.

   :param str moves: the moves to repeat ie ``RU1D?``
   :param str times: N of repetitions. If `None` it means infinite repetitions.
   :return:    An iterator of the moves
   :rtype:     iterator

    Examples::

         >>> list(_repeat_moves('LUR', '3'))
         ['LUR', 'LUR', 'LUR']
         >>> list(_repeat_moves('ABC', '0'))
         []
         >>> _repeat_moves('ABC')  ## infinite repetitions
         repeat('ABC')
     """
    args = (moves,)
    if times is not None:
        args += (int(times), )
    return itt.repeat(*args)


[docs]def parse_expansion_moves(exp_moves):
    """
    Parse rect-expansion into a list of dir-letters iterables.

    :param exp_moves:
        A string with a sequence of primitive moves:
        es. L1U1R1D1
    :type xl_ref: str

    :return:
        A list of primitive-dir chains.
    :rtype: list


    Examples::

        >>> res = parse_expansion_moves('lu1urd?')
        >>> res
        [repeat('L'), repeat('U', 1), repeat('UR'), repeat('D', 1)]

        # infinite generator
        >>> [next(res[0]) for i in range(10)]
        ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L']

        >>> list(res[1])
        ['U']

        >>> parse_expansion_moves('1LURD')
        Traceback (most recent call last):
        ValueError: Invalid rect-expansion(1LURD) due to:
                'NoneType' object has no attribute 'groupdict'

    """
    try:
        res = _re_exp_moves_splitter.split(exp_moves.upper().replace('?', '1'))

        return [_repeat_moves(**_re_exp_moves_parser.match(v).groupdict())
                for v in res
                if v != '']

    except Exception as ex:
        msg = 'Invalid rect-expansion({}) due to: {}'
        raise ValueError(msg.format(exp_moves, ex))


def _parse_edge(gs, prefix, default_edge):
    row_a1, row_rc = gs.pop('%s_row' % prefix), gs.pop('%s_row2' % prefix)
    col_a1, col_rc = gs.pop('%s_col' % prefix), gs.pop('%s_col2' % prefix)
    return Edge_new(row_a1 or row_rc, col_a1 or col_rc,
                    gs.pop('%s_mov' % prefix),
                    gs.pop('%s_mod' % prefix), default_edge)


[docs]def parse_xlref_fragment(xlref_fragment):
    """
    Parses a :term:`xl-ref` fragment, anything to the left of the hash(`#`).

    :param str xlref_fragment:
            the url-fragment part of the :term:`xl-ref` string,
            including the ``'#'`` char.
    :return:
        dictionary containing the following parameters:

        - sheet: (str, int, None) i.e. ``sheet_name``
        - st_edge: (Edge, None) the 1st-ref, with raw cell
          i.e. ``Edge(land=Cell(row='8', col='UPT'), mov='LU', mod='-')``
        - nd_edge: (Edge, None) the 2nd-ref, with raw cell
          i.e. ``Edge(land=Cell(row='_', col='.'), mov='D', mod='+')``
        - exp_moves: (sequence, None), as i.e. ``LDL1`` parsed by
          :func:`parse_expansion_moves()`
        - js_filt: dict i.e. ``{"dims: 1}``

    :rtype: dict


    Examples::

        >>> res = parse_xlref_fragment('Sheet1!A1(DR+):Z20(UL):L1U2R1D1:'
        ...                             '{"opts":{}, "func": "foo"}')
        >>> sorted(res.items())
        [('call_spec', CallSpec(func='foo', args=[], kwds={})),
         ('exp_moves', 'L1U2R1D1'),
         ('nd_edge', Edge(land=Cell(row='20', col='Z'), mov='UL', mod=None)),
         ('opts', {}),
         ('sh_name', 'Sheet1'),
         ('st_edge', Edge(land=Cell(row='1', col='A'), mov='DR', mod='+'))]

    Shortcut for all sheet from top-left to bottom-right full-cells::

        >>> res = parse_xlref_fragment(':')
        >>> sorted(res.items())
        [('call_spec', None),
         ('exp_moves', None),
         ('nd_edge', Edge(land=Cell(row='_', col='_'), mov=None, mod=None)),
         ('opts', None),
         ('sh_name', None),
         ('st_edge', Edge(land=Cell(row='^', col='^'), mov=None, mod=None))]


    Errors::

        >>> parse_xlref_fragment('A1(DR)Z20(UL)')
        Traceback (most recent call last):
        SyntaxError: Not an `xl-ref` syntax: A1(DR)Z20(UL)

    """

    m = _regular_xlref_regex.match(xlref_fragment)
    if not m:
        raise SyntaxError('Not an `xl-ref` syntax: %s' % xlref_fragment)

    gs = m.groupdict()

    is_colon = gs.pop('colon')
    gs['st_edge'] = _parse_edge(gs, 'st', is_colon and _topleft_Edge)
    gs['nd_edge'] = _parse_edge(gs, 'nd', is_colon and _bottomright_Edge)
    assert is_colon or not gs['nd_edge'], (xlref_fragment, gs['nd_edge'])
    if gs['st_edge'] == gs['nd_edge'] == None:
        gs['st_edge'] = _topleft_Edge
        gs['nd_edge'] = _bottomright_Edge

    exp_moves = gs['exp_moves']
    gs['exp_moves'] = exp_moves

    js = gs.pop('js_filt', None)
    if js:
        try:
            js = json.loads(js)
        except ValueError as ex:
            msg = 'Filters are not valid JSON: %s\n  JSON: \n%s'
            raise ValueError(msg % (ex, js))

    opts = js.pop('opts', None) if isinstance(js, dict) else None
    if opts is not None and not isinstance(opts, dict):
        msg = 'Filter-opts({}) must be a json-object(dictionary)!'
        raise ValueError(msg.format(opts))
    gs['opts'] = opts
    gs['call_spec'] = parse_call_spec(js) if js else None

    return gs


[docs]def parse_xlref(xlref):
    """
    Like ``_parse_xlref()`` but tries also if `xlreaf` is encased by delimiter chars ``/\\"$%&``.

    .. seealso:: _encase_regex
    """
    try:
        res = _parse_xlref(xlref)
    except SyntaxError as ex:
        try:
            m = _encase_regex.match(xlref)
        except SyntaxError:
            raise ex
        else:
            if m:
                res = _parse_xlref(m.group(2))
                res['xl-ref'] = xlref
            else:
                raise ex

    return res


[docs]def _parse_xlref(xlref):
    """
    Parse a :term:`xl-ref` into a dict.

    :param str xlref:
            A url-string abiding to the :term:`xl-ref` syntax.
    :return:
            A dict with all fields, with None with those missing.
    :rtype: dict


    Examples::

        >>> res = parse_xlref('workbook.xlsx#Sheet1!A1(DR+):Z20(UL):L1U2R1D1:'
        ...                             '{"opts":{}, "func": "foo"}')
        >>> sorted(res.items())
         [('call_spec', CallSpec(func='foo', args=[], kwds={})),
         ('exp_moves', 'L1U2R1D1'),
         ('nd_edge', Edge(land=Cell(row='20', col='Z'), mov='UL', mod=None)),
         ('opts', {}),
         ('sh_name', 'Sheet1'),
         ('st_edge', Edge(land=Cell(row='1', col='A'), mov='DR', mod='+')), ('url_file', 'workbook.xlsx'), ('xl_ref', 'workbook.xlsx#Sheet1!A1(DR+):Z20(UL):L1U2R1D1:{"opts":{}, "func": "foo"}')]

    Shortcut for all sheet from top-left to bottom-right full-cells::

        >>> res=parse_xlref('#:')
        >>> sorted(res.items())
        [('call_spec', None),
         ('exp_moves', None),
         ('nd_edge', Edge(land=Cell(row='_', col='_'), mov=None, mod=None)),
         ('opts', None),
         ('sh_name', None),
         ('st_edge', Edge(land=Cell(row='^', col='^'), mov=None, mod=None)),
         ('url_file', None),
         ('xl_ref', '#:')]


    Errors::

        >>> parse_xlref('A1(DR)Z20(UL)')
        Traceback (most recent call last):
        SyntaxError: No fragment-part (starting with '#'): A1(DR)Z20(UL)

        >>> parse_xlref('#A1(DR)Z20(UL)')          ## Missing ':'.
        Traceback (most recent call last):
        SyntaxError: Not an `xl-ref` syntax: A1(DR)Z20(UL)

    But as soon as syntax is matched, subsequent errors raised are
    :class:`ValueErrors`::

        >>> parse_xlref("#A1:B1:{'Bad_JSON_str'}")
        Traceback (most recent call last):
        ValueError: Filters are not valid JSON:
        Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
          JSON:
        {'Bad_JSON_str'}
    """
    xlref = xlref.translate(_excel_str_translator)
    url_file, frag = urldefrag(xlref)
    if not frag:
        raise SyntaxError(
            "No fragment-part (starting with '#'): %s" % xlref)
    res = parse_xlref_fragment(frag)
    frag = frag.strip()
    res['url_file'] = url_file or None
    res['xl_ref'] = xlref

    return res