Source code for pandalone.xleash.io._xlrd

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2014-2019European Commission (JRC);
# Licensed under the EUPL (the 'Licence');
# You may not use this work except in compliance with the Licence.
# You may obtain a copy of the Licence at: http://ec.europa.eu/idabc/eupl
"""
Implements the *xlrd* backend of *xleash* that reads in-file Excel-spreadsheets.

.. currentmodule:: pandalone.xleash
"""

import datetime
from distutils.version import LooseVersion
import logging

from urllib import request
from urllib.parse import urlparse
from pandalone.xleash.io.backend import ABCBackend, ABCSheet, SheetId
from xlrd import (
    xldate,
    XL_CELL_DATE,
    XL_CELL_EMPTY,
    XL_CELL_TEXT,
    XL_CELL_BLANK,
    XL_CELL_ERROR,
    XL_CELL_BOOLEAN,
    XL_CELL_NUMBER,
)
import xlrd

import numpy as np

from .. import EmptyCaptureException, Coords, io_backends
from ... import utils, xlsutils


log = logging.getLogger(__name__)

# noinspection PyUnresolvedReferences
if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
    _xlrd_0_9_3 = True
else:
    _xlrd_0_9_3 = False


[docs]def _parse_cell(xcell, epoch1904=False): """ Parse a xl-xcell. :param xlrd.Cell xcell: an excel xcell :type xcell: xlrd.sheet.Cell :param epoch1904: Which date system was in force when this file was last saved. False => 1900 system (the Excel for Windows default). True => 1904 system (the Excel for Macintosh default). :type epoch1904: bool :return: formatted xcell value :rtype: int, float, datetime.datetime, bool, None, str, datetime.time, float('nan') Examples:: >>> import xlrd >>> from xlrd.sheet import Cell >>> _parse_cell(Cell(xlrd.XL_CELL_NUMBER, 1.2)) 1.2 >>> _parse_cell(Cell(xlrd.XL_CELL_DATE, 1.2)) datetime.datetime(1900, 1, 1, 4, 48) >>> _parse_cell(Cell(xlrd.XL_CELL_TEXT, 'hi')) 'hi' """ ctype = xcell.ctype cvalue = xcell.value if ctype == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising cint = int(cvalue) if cint == cvalue: return cint return cvalue elif ctype in (XL_CELL_EMPTY, XL_CELL_BLANK): return None # RECT-LOOP NEVER USE THIS elif ctype == XL_CELL_TEXT: return cvalue elif ctype == XL_CELL_BOOLEAN: return bool(cvalue) elif ctype == XL_CELL_DATE: # modified from Pandas library if _xlrd_0_9_3: # Use the newer xlrd datetime handling. d = xldate.xldate_as_datetime(cvalue, epoch1904) # Excel doesn't distinguish between dates and time, so we treat # dates on the epoch as times only. Also, Excel supports 1900 and # 1904 epochs. epoch = (1904, 1, 1) if epoch1904 else (1899, 12, 31) if (d.timetuple())[0:3] == epoch: d = datetime.time(d.hour, d.minute, d.second, d.microsecond) else: # Use the xlrd <= 0.9.2 date handling. d = xldate.xldate_as_tuple(xcell.value, epoch1904) if d[0] < datetime.MINYEAR: # time d = datetime.time(*d[3:]) else: # date d = datetime.datetime(*d) return d elif ctype == XL_CELL_ERROR: return float("nan") raise ValueError( "Invalid XL-cell type(%s) for value(%s)!" % (xcell.ctype, xcell.value) )
[docs]def _open_sheet_by_name_or_index(xlrd_book, wb_id, sheet_id): """ :param int or str or None sheet_id: If `None`, opens 1st sheet. """ if sheet_id is None: sheet_id = 0 if isinstance(sheet_id, int): xl_sh = xlrd_book.sheet_by_index(sheet_id) else: try: xl_sh = xlrd_book.sheet_by_name(sheet_id) except Exception as xl_ex: try: sheet_id = int(sheet_id) except ValueError: raise xl_ex from None else: xl_sh = xlrd_book.sheet_by_index(sheet_id) return XlrdSheet(xl_sh, wb_id)
[docs]class XlrdSheet(ABCSheet): """ The *xlrd* workbook wrapper required by xleash library. """
[docs] def __init__(self, sheet, book_fname, epoch1904=False): if not isinstance(sheet, xlrd.sheet.Sheet): raise ValueError("Invalid xlrd-sheet({})".format(sheet)) self._sheet = sheet self._epoch1904 = epoch1904 self.book_fname = book_fname
[docs] def _close(self): """ Override it to release resources for this sheet.""" self._sheet.book.unload_sheet(self._sheet.name)
[docs] def _close_all(self): """ Override it to release resources this and all sibling sheets.""" self._sheet.book.release_resources()
[docs] def get_sheet_ids(self): sh = self._sheet return SheetId(self.book_fname or sh.book.filestr, [sh.name, sh.number])
[docs] def open_sibling_sheet(self, sheet_id): """Gets by-index only if `sheet_id` is `int`, otherwise tries both by name and index.""" return _open_sheet_by_name_or_index(self._sheet.book, self.book_fname, sheet_id)
[docs] def list_sheetnames(self): return self._sheet.book.sheet_names()
[docs] def _read_states_matrix(self): """See super-method. """ types = np.asarray(self._sheet._cell_types) return (types != XL_CELL_EMPTY) & (types != XL_CELL_BLANK)
[docs] def _read_margin_coords(self): nrows = self._sheet.nrows - 1 ncols = self._sheet.ncols - 1 if nrows < 0 or ncols < 0: raise EmptyCaptureException("empty sheet") return None, Coords(nrows, ncols)
[docs] def read_rect(self, st, nd): """See super-method. """ sheet = self._sheet if nd is None: return _parse_cell(sheet.cell(*st), self._epoch1904) rect = np.array([st, nd]) + [[0, 0], [1, 1]] states_matrix = self.get_states_matrix() table = [] for r in range(*rect[:, 0]): row = [] table.append(row) for c in range(*rect[:, 1]): try: if states_matrix[r, c]: c = _parse_cell(sheet.cell(r, c), self._epoch1904) row.append(c) continue except IndexError: pass row.append(None) return table
[docs]class XlrdBackend(ABCBackend):
[docs] def bid(self, wb_url): if wb_url: parts = urlparse(wb_url) path = utils.urlpath2path(parts.path) if xlsutils._xl_extensions_anywhere.search(path): return 100
[docs] def open_sheet(self, wb_url, sheet_id): """ Opens the local or remote `wb_url` *xlrd* workbook wrapped as :class:`XlrdSheet`. """ assert wb_url, (wb_url, sheet_id) book = self._open_book(wb_url) return _open_sheet_by_name_or_index(book, wb_url, sheet_id)
[docs] def list_sheetnames(self, wb_url): # TODO: QnD list_sheetnames()! book = self._open_book(wb_url) return book.sheet_names()
def _open_book(self, url): parts = filename = urlparse(url) ropts = parts.params or {} if "logfile" not in ropts: ropts["logfile"] = utils.LoggerWriter(log, logging.DEBUG) if parts.scheme == "file": path = utils.urlpath2path(parts.path) log.info("Opening book(%r)...", path) book = xlrd.open_workbook(path, **ropts) else: ropts.pop("on_demand", None) http_opts = ropts.get("http_opts", {}) with request.urlopen(url, **http_opts) as response: log.info("Opening book(%r)...", filename) book = xlrd.open_workbook(filename, file_contents=response, **ropts) return book
def load_as_xleash_plugin(): loaded = [be for be in io_backends if isinstance(be, XlrdBackend)] if not loaded: io_backends.insert(0, XlrdBackend())