# Source code for cobralib.io

# Import necessary packages here
import json
import logging
import logging.handlers
import os
import re
import xml.etree.ElementTree as ET
from collections import deque
from typing import Any, Union

import pandas as pd
import pdfplumber
import xmltodict
import yaml

# ==========================================================================================
# ==========================================================================================

# File:    io.py
# Date:    July 09, 2023
# Author:  Jonathan A. Webb
# Purpose: This file contains classes and functions that can be used to read and write
#          to files
# ==========================================================================================
# ==========================================================================================
# Insert Code here


class ReadYAML:
    """
    Read typed scalars, lists and flat dictionaries from a YAML-like file.

    :param file_name: The name and path of the file with the yaml-like format
    :raises FileNotFoundError: If the file does not exist.

    This class is tailored to read basic YAML files, but with looser
    requirements on how key words are formatted, and stricter requirements
    on data typing.  The methods within this class can be used to read
    scalar variables from key-value pairs, lists, and flat dictionaries, and
    every value is cast to the data type supplied by the caller.

    The file is read once, line by line, when the instance is created; the
    keyword methods then scan those cached lines.  Only
    :meth:`read_full_yaml` hands the file to PyYAML.

    Documents are separated wherever the text ``---`` occurs, and keywords
    are matched as a *prefix* of each left-stripped line.

    All code examples described in the documentation for this class
    reference the read_yaml.yaml file shown below.

    .. literalinclude:: ../../../data/test/read_yaml.yaml
       :language: text
    """

    # Symbols announcing a string stored on the following line(s)
    _BLOCK_INDICATORS = ("^", ">", "|")
    # Case-insensitive spellings accepted when casting to bool
    _TRUE_WORDS = ("TRUE", "YES", "ON")
    _FALSE_WORDS = ("FALSE", "NO", "OFF")

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__yamllines = self._read_yamllines()

    # ------------------------------------------------------------------------------------------

    def read_key_value(
        self, keyword: str, data_type: type, document_index: int = 0
    ) -> Any:
        """
        Read the scalar value that follows a keyword.

        :param keyword: The keyword associated with the value to be read in.
                        Unlike a pure YAML file this value does not have to
                        end with a : symbol
        :param data_type: The data type of the value to be read in
        :param document_index: The number of the yaml document in the yaml
                               file.
        :return value: The value associated with a keyword
        :raise ValueError: If the keyword or document is not found, or the
                           value can not be cast to the user defined type

        This method recognizes the >, ^, and | symbols that mark strings
        which start on the next line or span several lines.  When
        ``data_type`` is ``bool`` the words True/yes/on map to ``True`` and
        False/no/off map to ``False`` (case insensitive).

        Example 1
        ---------
        Read a float value from the first yaml document.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_key_value('key:', float, 0)
           print(value)
           >> 4.387

        Example 2
        ---------
        Read multiline string values from the second yaml document.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_key_value('Multi Sentence:', str, 1)
           new_value = reader.read_key_value('Second Mult Sentence:', str, 1)
           print(value)
           print(new_value)

        .. code-block:: bash

           >> This is a multiline sentence, there is no reason to worry!
           >> This is a multiline sentence, there is no reason to worry!

        Example 3
        ---------
        The different spellings accepted for boolean values.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           true_value = reader.read_key_value('bool test1:', bool, 1)
           yes_value = reader.read_key_value('bool test4:', bool, 1)
           on_value = reader.read_key_value('bool test5:', bool, 1)
           false_value = reader.read_key_value('bool test2:', bool, 1)
           no_value = reader.read_key_value('bool test3:', bool, 1)
           off_value = reader.read_key_value('bool test6:', bool, 1)

        .. code-block:: bash

           >> True
           >> True
           >> True
           >> False
           >> False
           >> False
        """
        lines = self._document_lines(document_index)
        for index, line in enumerate(lines):
            stripped_line = line.lstrip()
            if stripped_line.startswith(keyword):
                keyword_indent = len(line) - len(stripped_line)
                value_str = stripped_line[len(keyword) :].strip()
                return self._parse_value(
                    value_str, lines[index + 1 :], keyword_indent, data_type
                )
        raise ValueError(f"Keyword '{keyword}' not found in the specified document")

    # ------------------------------------------------------------------------------------------

    def read_yaml_list(
        self, keyword: str, data_type: type, document_index: int = 0
    ) -> list[Any]:
        """
        Read the list that follows a keyword.

        :param keyword: The keyword associated with the value to be read in.
                        Unlike a pure YAML file, this value does not have to
                        end with a : symbol.
        :param data_type: The data type of every list item
        :param document_index: The number of the yaml document in the yaml
                               file.
        :return value: The list associated with a keyword
        :raise ValueError: If the keyword or document is not found, or an
                           item can not be cast to the user defined type

        Both inline lists (``key: [1, 2, 3]``) and block lists (one ``-``
        item per line, indented deeper than the keyword) are supported.  An
        empty inline list ``[]`` yields an empty list.  Block items may use
        the ^, > or | symbols to place a string on the following line(s).

        Example 1
        ---------
        Read a list of float values from the first yaml document.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           list_values = reader.read_yaml_list('First List:', float, 0)
           print(list_values)

        .. code-block:: bash

           >> [1.1, 2.2, 3.3, 4.4]

        Example 2
        ---------
        Read string items, some of which use the ^, > or | symbols.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           list_values = reader.read_yaml_list('Numbers:', str, 0)
           print(list_values)

        .. code-block:: text

           >> ['Hello World This is Jon', 'This', 'Is', 'Correct']
        """
        lines = self._document_lines(document_index)
        values = []
        is_reading_list = False
        keyword_indent = 0
        i = 0
        while i < len(lines):
            line = lines[i]
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)

            if stripped_line.startswith(keyword):
                keyword_indent = current_indent
                is_reading_list = True
                # Inline form: everything is on the keyword line
                rest_of_line = stripped_line[len(keyword) :].strip()
                if rest_of_line.startswith("[") and rest_of_line.endswith("]"):
                    inner = rest_of_line[1:-1]
                    if inner.strip():  # "[]" is an empty list, not ['']
                        values.extend(
                            self._cast_value(item.strip(), data_type)
                            for item in inner.split(",")
                        )
                    return values
                i += 1
                continue

            if is_reading_list:
                # A line at or left of the keyword's indent ends the list
                if current_indent <= keyword_indent:
                    break
                if stripped_line.startswith("-"):
                    value_str = stripped_line[1:].strip()
                    if value_str in self._BLOCK_INDICATORS:
                        complex_str = value_str
                        value_str = ""
                        i += 1
                        while i < len(lines):
                            next_line = lines[i]
                            next_indent = len(next_line) - len(next_line.lstrip())
                            if next_indent <= current_indent:
                                # Step back so the outer loop sees this line
                                i -= 1
                                break
                            if complex_str == "^":
                                value_str = next_line.strip()
                                break
                            elif complex_str == "|":
                                value_str += next_line.strip() + "\n"
                            elif complex_str == ">":
                                next_line_content = next_line[current_indent:].lstrip()
                                value_str += next_line_content + " "
                            i += 1
                        if complex_str == ">":
                            value_str = value_str.rstrip()
                    values.append(self._cast_value(value_str, data_type))
            i += 1

        msg = f"Keyword '{keyword}' not found or it is "
        msg += "not a list in the specified document."
        if not is_reading_list:
            raise ValueError(msg)
        return values

    # ------------------------------------------------------------------------------------------

    def read_yaml_dict(
        self,
        keyword: str,
        key_data_type: type,
        value_data_type: type,
        document_index: int = 0,
    ) -> dict:
        """
        Read the flat dictionary that follows a keyword.

        :param keyword: The keyword associated with the value to be read in.
                        Unlike a pure YAML file, this value does not have to
                        end with a : symbol.
        :param key_data_type: The data type of the key value.
        :param value_data_type: The data type of the value to be read in
        :param document_index: The number of the yaml document in the yaml
                               file.
        :return value: The dictionary associated with a keyword
        :raise ValueError: If the keyword or document is not found, or a key
                           or value can not be cast to the user defined type

        Entries are the ``key: value`` lines indented deeper than the
        keyword.  When a value is one of the >, ^, or | symbols, the deeper
        indented lines that follow are joined with newlines and their common
        indentation is removed.

        **NOTE:** This method assumes a flat (i.e. not nested) dictionary
        structure.

        Example 1
        ---------
        Read a dictionary of integers from the second yaml document.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_yaml_dict('Ages:', str, int, 1)
           print(value)

        .. code-block:: text

           >> {'Jon': 44, 'Jill': 32, 'Bob': 12}
        """
        lines = self._document_lines(document_index)
        found_dict = {}
        is_reading_dict = False
        keyword_indent = None
        i = 0
        while i < len(lines):
            line = lines[i]
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)

            if stripped_line.startswith(keyword):
                is_reading_dict = True
                keyword_indent = current_indent
                i += 1
                continue

            if is_reading_dict:
                if current_indent <= keyword_indent:
                    break
                if ":" in stripped_line:
                    key_str, value_str = map(str.strip, stripped_line.split(":", 1))
                    key = self._cast_value(key_str, key_data_type)
                    if value_str in self._BLOCK_INDICATORS:
                        block_lines = []
                        i += 1
                        while i < len(lines):
                            next_line = lines[i]
                            next_indent = len(next_line) - len(next_line.lstrip())
                            if next_indent <= current_indent:
                                # Step back so the outer loop sees this line
                                i -= 1
                                break
                            block_lines.append(next_line)
                            i += 1
                        value = self._cast_value(
                            "\n".join(block_lines), value_data_type
                        )
                        # Only text can carry indentation to strip
                        if isinstance(value, str):
                            value = self._remove_uniform_indent(value)
                    else:
                        value = self._cast_value(value_str, value_data_type)
                    found_dict[key] = value
            i += 1

        msg = f"Keyword '{keyword}' not found or it is not a "
        msg += " dictionary in the specified document."
        if not is_reading_dict:
            raise ValueError(msg)
        return found_dict

    # ------------------------------------------------------------------------------------------

    def read_full_yaml(self, safe_read: bool = True) -> Any:
        """
        Read the full YAML file with PyYAML.

        :param safe_read: Whether to read the file in safe mode or not.
                          Defaulted to True
        :return Any: A list holding the parsed content of every document in
                     the file.

        Unlike the other methods in this class, this method parses the whole
        file with PyYAML, so the file must adhere to the strict rules of
        YAML.  The rules can be found at
        `PyYaml <https://pyyaml.org/wiki/PyYAMLDocumentation>`_.

        Example 1
        ---------

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           data = reader.read_full_yaml()  # Read in as safe mode
           print(data[1]['Ages'])

        .. code-block:: text

           >> {'Jon': 44, 'Jill': 32, 'Bob': 12}
        """
        with open(self._file_name) as file:
            if safe_read:
                return list(yaml.safe_load_all(file))
            # PyYAML 6 requires an explicit Loader; FullLoader is what the
            # bare call defaulted to in PyYAML 5.1+.
            # NOTE(security): do not use safe_read=False on untrusted files.
            return list(yaml.load_all(file, Loader=yaml.FullLoader))

    # ------------------------------------------------------------------------------------------

    def read_yaml_dict_of_list(
        self,
        keyword: str,
        key_data_type: type,
        list_data_type: type,
        document_index: int = 0,
    ) -> dict:
        """
        Read the flat dictionary of lists that follows a keyword.

        :param keyword: The keyword associated with the value to be read in.
                        Unlike a pure YAML file, this value does not have to
                        end with a : symbol.
        :param key_data_type: The data type of the key value.
        :param list_data_type: The data type of every list item
        :param document_index: The number of the yaml document in the yaml
                               file.
        :return value: The dictionary associated with a keyword
        :raise ValueError: If the keyword or document is not found, if a
                           list item appears before any key, or a key or
                           item can not be cast to the user defined type

        Each key may hold an inline list (``key: [1, 2]``) or a block list of
        ``-`` items on the following lines.  Block items may use the >, ^,
        and | symbols to place a string on the following line(s).

        **NOTE:** This method assumes a flat (i.e. not nested) dictionary
        structure.

        Example 1
        ---------
        Read a dictionary of integer lists from the first yaml document.

        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_yaml_dict_of_list('Dict List:', str, int, 0)
           print(value)

        .. code-block:: text

           >> {'One': [1, 2, 3], 'Two': [3, 4, 5], 'Three': [6, 7, 8]}
        """
        lines = self._document_lines(document_index)
        is_reading_dict = False
        keyword_indent = None
        current_dict = {}
        current_list = None
        i = 0
        while i < len(lines):
            line = lines[i]
            i += 1
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)

            if stripped_line.startswith(keyword):
                is_reading_dict = True
                keyword_indent = current_indent
                continue
            if not is_reading_dict:
                continue
            if current_indent <= keyword_indent:
                break

            if ":" in stripped_line:
                key_str, value_str = map(str.strip, stripped_line.split(":", 1))
                key = self._cast_value(key_str, key_data_type)
                if value_str.startswith("[") and value_str.endswith("]"):
                    inner = value_str[1:-1]
                    current_dict[key] = (
                        [
                            self._cast_value(item.strip(), list_data_type)
                            for item in inner.split(",")
                        ]
                        if inner.strip()
                        else []
                    )
                    current_list = None
                else:
                    current_list = []
                    current_dict[key] = current_list
            elif stripped_line.startswith("-"):
                if current_list is None:
                    raise ValueError(
                        f"List item '{stripped_line}' does not belong to a key"
                    )
                value_str = stripped_line[1:].strip()
                if value_str in self._BLOCK_INDICATORS:
                    indicator = value_str
                    # Find where the block ends by index, so the line that
                    # terminates it is still seen by the outer loop.
                    end = i
                    while (
                        end < len(lines)
                        and self._calculate_indent(lines[end]) > current_indent
                    ):
                        end += 1
                        if indicator == "^":  # "^" takes exactly one line
                            break
                    value_str = self._parse_block_scalar(
                        lines[i:end], current_indent, indicator
                    )
                    i = end
                current_list.append(self._cast_value(value_str, list_data_type))

        msg = f"Keyword '{keyword}' not found or it is not "
        msg += "dictionary of lists in the specified document."
        if not is_reading_dict:
            raise ValueError(msg)
        return current_dict

    # ==========================================================================================
    # PRIVATE-LIKE methods

    def _read_yamllines(self):
        """
        Return every line of the file with trailing whitespace removed
        """
        with open(self._file_name) as file:
            return [line.rstrip() for line in file]

    # ------------------------------------------------------------------------------------------

    def _read_yaml_documents(self):
        """
        Split the cached text on every ``---`` and drop blank documents
        """
        text = "\n".join(self.__yamllines)
        return [document for document in text.split("---") if document.strip()]

    # ------------------------------------------------------------------------------------------

    def _document_lines(self, document_index: int) -> list:
        """
        Return the lines of one document, raising ValueError if the index
        is out of range
        """
        yaml_docs = self._read_yaml_documents()
        self._check_document_length(document_index, yaml_docs)
        return yaml_docs[document_index].split("\n")

    # ------------------------------------------------------------------------------------------

    def _check_document_length(self, document_index, yaml_docs) -> None:
        """
        Raise ValueError when document_index does not address a document
        """
        if document_index >= len(yaml_docs) or document_index < 0:
            raise ValueError(
                f"Document index {document_index} out of range. "
                f"File contains {len(yaml_docs)} documents."
            )

    # ------------------------------------------------------------------------------------------

    def _calculate_indent(self, line: str):
        """
        Return the number of leading whitespace characters in line
        """
        return len(line) - len(line.lstrip())

    # ------------------------------------------------------------------------------------------

    def _parse_block_scalar(self, lines, current_indent: int, complex_str: str):
        """
        Build the string introduced by a ^, | or > symbol.

        :param lines: Iterable of the lines that follow the symbol
        :param current_indent: Indent of the line holding the symbol; the
                               block ends at the first line indented no
                               deeper than this (blank lines included)
        :param complex_str: ``^`` returns the first block line, ``|`` joins
                            the block lines with newlines, ``>`` joins them
                            with single spaces
        :return: The assembled string, or an empty string if the block has
                 no lines
        """
        pieces = []
        for raw_line in lines:
            line = raw_line.rstrip()
            indent = self._calculate_indent(line)
            if indent <= current_indent:
                break
            content = line[indent:]
            if complex_str == "^":
                return content.strip()
            pieces.append(content)
        if complex_str == "|":
            return "\n".join(pieces).rstrip()
        if complex_str == ">":
            return " ".join(pieces).rstrip()
        return ""

    # ------------------------------------------------------------------------------------------

    def _remove_uniform_indent(self, multi_line_str: str) -> str:
        """
        Remove the indentation shared by all non-blank lines of a string
        """
        lines = multi_line_str.split("\n")
        # default=0 covers text with no non-blank line, where there is
        # nothing to measure and nothing to strip
        min_indent = min(
            (len(line) - len(line.lstrip()) for line in lines if line.strip()),
            default=0,
        )
        return "\n".join(line[min_indent:] for line in lines)

    # ------------------------------------------------------------------------------------------

    def _cast_value(self, value_str: str, data_type: type) -> Any:
        """
        Cast text to data_type.

        ``bool`` is handled by word lookup because ``bool("False")`` is
        True.  Raises ValueError when the text can not be cast.
        """
        if data_type == bool:
            word = value_str.upper()
            if word in self._TRUE_WORDS:
                return True
            if word in self._FALSE_WORDS:
                return False
            raise ValueError("Invalid boolean value")
        try:
            return data_type(value_str)
        except ValueError as err:
            raise ValueError("Invalid value") from err

    # ------------------------------------------------------------------------------------------

    def _parse_value(
        self, value_str: str, subsequent_lines: list, keyword_indent: int, data_type: type
    ) -> Any:
        """
        Resolve a block-string symbol if one is present, then cast the text.

        :param value_str: Text found to the right of the keyword
        :param subsequent_lines: Lines following the keyword line, used only
                                 when value_str is ^, > or | and data_type
                                 is str
        :param keyword_indent: Indent of the keyword line
        :param data_type: Type the value is cast to
        :raises ValueError: If the text can not be cast
        """
        if data_type == str and value_str in self._BLOCK_INDICATORS:
            value_str = self._parse_block_scalar(
                subsequent_lines, keyword_indent, value_str
            )
        return self._cast_value(value_str, data_type)
# ========================================================================================== # ==========================================================================================
class ReadJSON:
    """
    Read JSON data from a pure JSON file or from a mixed keyword file.

    :param file_name: The name and path of the file with the json-like
                      format.  While not required, it is recommended that a
                      mixed file use a .jwc extension.
    :raises FileNotFoundError: If the file does not exist.

    The file containing json data can be a pure .json file, or JSON values
    can be mixed with yaml-like key value pairs.  :meth:`read_json` reads
    the JSON value that follows a keyword, and :meth:`read_full_json`
    parses the whole file as one JSON document.
    """

    # Marks "key not present"; None can not be used because it is how a
    # JSON null value is represented
    _MISSING = object()

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__jsonlines = self._read_jsonlines()

    # ------------------------------------------------------------------------------------------

    def read_json(self, keyword: str) -> dict:
        """
        Read the JSON value written to the right of a keyword.

        :param keyword: The keyword to search for at the start of each
                        (whitespace-stripped) line.
        :return: The decoded JSON data; a dictionary for a JSON object.
        :raises ValueError: If the keyword is not found or if the JSON data
                            is not valid.

        The value may continue over as many lines as it needs.  It is
        decoded with the JSON parser itself, so braces and brackets inside
        string values and top-level arrays are handled.  Text left on the
        line after the end of the value is rejected as invalid.

        Example 1
        ---------
        A file that mixes YAML and JSON data types.

        .. code-block:: text

           Yaml List:
             - 1
             - 2
             - 3
           Yaml Key: Test String
           Yaml Dict:
             One: 1.1
             Two: 2.2
             Three: 3.3
           Json Book Data: {"book": "History of the World", "year": 1976}

        .. code-block:: python

           from cobralib.io import ReadJSON
           # Instantiate the class
           reader = ReadJSON("test_key_words.jwc")
           value = reader.read_json("Json Book Data:")
           print(value)

        .. code-block:: text

           >> {'book': 'History of the World', 'year': 1976}
        """
        decoder = json.JSONDecoder()
        for index, raw_line in enumerate(self.__jsonlines):
            line = raw_line.strip()
            if not line.startswith(keyword):
                continue
            # The value starts after the keyword and may run onto later lines
            remainder = line[len(keyword) :]
            text = "\n".join([remainder, *self.__jsonlines[index + 1 :]]).lstrip()
            try:
                value, end = decoder.raw_decode(text)
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"Invalid JSON data for keyword '{keyword}': {err}"
                ) from err
            # Nothing but whitespace may follow the value on its last line
            trailing = text[end:].split("\n", 1)[0]
            if trailing.strip():
                raise ValueError(
                    f"Invalid JSON data for keyword '{keyword}': "
                    f"unexpected text '{trailing.strip()}' after the value"
                )
            return value
        raise ValueError(f"Keyword '{keyword}' not found in the file")

    # ------------------------------------------------------------------------------------------

    def read_full_json(self, keyword: str = None) -> Union[dict, list]:
        """
        Read the entire file as JSON, optionally returning one nested value.

        :param keyword: The key to search for.  If None, the entire JSON
                        data is returned.
        :return: The JSON data, or the value stored under the first
                 occurrence of ``keyword`` found by a depth-first search
                 through nested dictionaries and lists.
        :raises ValueError: If the keyword is specified but not found in the
                            file.

        Unlike :meth:`read_json`, this method assumes the entire file is
        formatted as JSON.  A key whose value is ``null`` is found and
        returned as ``None``.

        Example 1
        ---------
        Assume example.json has the following content.

        .. code-block:: json

           {
               "key1": "value1",
               "key2": {
                   "subkey1": "subvalue1",
                   "subkey2": {
                       "subsubkey1": "subsubvalue1",
                       "subsubkey2": "subsubvalue2"
                   }
               }
           }

        .. code-block:: python

           from cobralib.io import ReadJSON
           reader = ReadJSON("example.json")
           value = reader.read_full_json()
           print(value)
           new_value = reader.read_full_json("subkey2")
           print(new_value)

        .. code-block:: text

           >> {'key1': 'value1', 'key2': {'subkey1': 'subvalue1', 'subkey2':
               {'subsubkey1': 'subsubvalue1', 'subsubkey2': 'subsubvalue2'}}}
           >> {'subsubkey1': 'subsubvalue1', 'subsubkey2': 'subsubvalue2'}
        """
        json_data = json.loads("\n".join(self.__jsonlines))
        if keyword is None:
            return json_data
        result = self._find_key(json_data, keyword)
        if result is self._MISSING:
            raise ValueError(f"Keyword '{keyword}' not found in the JSON data")
        return result

    # ==========================================================================================
    # PRIVATE-LIKE METHODS

    @classmethod
    def _find_key(cls, data, keyword):
        """
        Depth-first search for keyword; returns cls._MISSING when absent
        """
        if isinstance(data, dict):
            if keyword in data:
                return data[keyword]
            children = data.values()
        elif isinstance(data, list):
            children = data
        else:
            return cls._MISSING
        for child in children:
            found = cls._find_key(child, keyword)
            if found is not cls._MISSING:
                return found
        return cls._MISSING

    # ------------------------------------------------------------------------------------------

    def _read_jsonlines(self):
        """
        Return every line of the file with trailing whitespace removed
        """
        with open(self._file_name) as file:
            return [line.rstrip() for line in file]
# ========================================================================================== # ==========================================================================================
class ReadXML:
    """
    Read XML data from a pure XML file or from a mixed keyword file.

    :param file_name: The name and path of the file with the xml-like
                      format.  While not required, it is recommended that
                      this file either be an .xml or .jwc file.
    :raises FileNotFoundError: If the file does not exist.

    The file can contain traditional XML data, or XML fragments mixed with
    yaml-like key value pairs.  :meth:`read_xml` reads the XML fragment that
    follows a keyword, and :meth:`read_full_xml` parses the whole file as
    one XML document.  Both return the dictionary produced by
    ``xmltodict.parse``.
    """

    # First element name in a fragment; "?" and "!" are excluded so that an
    # XML declaration or a comment is not mistaken for the root element
    _ROOT_TAG = re.compile(r"<([^/>?!\s]+)")

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__xmllines = self._read_xml_lines()

    # ------------------------------------------------------------------------------------------

    def read_xml(self, keyword: str) -> dict:
        """
        Read the XML fragment written after a keyword.

        :param keyword: The keyword to search for at the start of each
                        line (leading whitespace is ignored).
        :return: The XML data as a dictionary.
        :raises ValueError: If the keyword is not found or if the XML data
                            is not valid.

        The fragment starts to the right of the keyword and runs until the
        line holding the closing tag of its first element, which may be the
        keyword line itself.  Lines are stripped and concatenated before
        parsing.

        Example 1
        ---------

        .. code-block:: text

           Yaml List:
             - 1
             - 2
             - 3
           Yaml Key: Test String
           XML Book Data:
           <root>
               <book>History of the World</book>
               <Year>1976</Year>
           </root>

        .. code-block:: python

           from cobralib.io import ReadXML
           reader = ReadXML("example.jwc")
           value = reader.read_xml("XML Book Data:")
           print(value)

        .. code-block:: text

           >> {'root': {'book': 'History of the World', 'Year': '1976'}}
        """
        fragments = []
        root_tag = None
        found_keyword = False
        for line in self.__xmllines:
            if not found_keyword:
                candidate = line.lstrip()
                if not candidate.startswith(keyword):
                    continue
                found_keyword = True
                fragment = candidate[len(keyword) :].strip()
            else:
                fragment = line.strip()
            fragments.append(fragment)

            if root_tag is None:
                match = self._ROOT_TAG.search(fragment)
                if match:
                    root_tag = match.group(1)
            # Checked on the keyword line too, so single-line XML stops here
            if root_tag is not None and f"</{root_tag}>" in fragment:
                break

        if not found_keyword:
            raise ValueError(f"Keyword '{keyword}' not found in the file")

        xml_data = "".join(fragments)
        if not (xml_data.startswith("<") and xml_data.endswith(">")):
            raise ValueError(f"Invalid XML data for keyword '{keyword}'")
        try:
            return xmltodict.parse(xml_data)
        except Exception as err:  # any parser failure means invalid XML
            raise ValueError(f"Invalid XML data for keyword '{keyword}'") from err

    # ------------------------------------------------------------------------------------------

    def read_full_xml(self, keyword: str = None):
        """
        Read the whole file as XML, optionally returning one element.

        :param keyword: The tag to search for in the XML data.  If None,
                        the full document is returned.
        :return: A dictionary for the full document, or for the first
                 element whose tag equals ``keyword`` (the root element
                 itself included).
        :raises ValueError: If the keyword is specified but not found in the
                            XML data.

        Example 1
        ---------
        Assume example.xml has the following content.

        .. code-block:: xml

           <root>
               <key1>value1</key1>
               <key2>
                   <subkey1>subvalue1</subkey1>
                   <subkey2>
                       <subsubkey1>subsubvalue1</subsubkey1>
                       <subsubkey2>subsubvalue2</subsubkey2>
                   </subkey2>
               </key2>
           </root>

        .. code-block:: python

           from cobralib.io import ReadXML
           reader = ReadXML("example.xml")
           value = reader.read_full_xml()
           print(value)
           new_value = reader.read_full_xml("subkey2")
           print(new_value)

        .. code-block:: text

           >> {'root': {'key1': 'value1', 'key2': {'subkey1': 'subvalue1',
               'subkey2': {'subsubkey1': 'subsubvalue1',
               'subsubkey2': 'subsubvalue2'}}}}
           >> {'subkey2': {'subsubkey1': 'subsubvalue1',
               'subsubkey2': 'subsubvalue2'}}
        """
        root = ET.parse(self._file_name).getroot()
        if keyword is None or root.tag == keyword:
            element = root
        else:
            # ".//" only looks at descendants, hence the root check above
            element = root.find(f".//{keyword}")
            if element is None:
                raise ValueError(f"Keyword '{keyword}' not found in the XML data")
        xml_string = ET.tostring(element, encoding="utf-8").decode()
        return xmltodict.parse(xml_string)

    # ==========================================================================================
    # PRIVATE-LIKE METHODS

    def _read_xml_lines(self):
        """
        Return every line of the file with trailing whitespace removed
        """
        with open(self._file_name) as file:
            return [line.rstrip() for line in file]
# ========================================================================================== # ==========================================================================================
class ReadKeyWords(ReadYAML, ReadJSON, ReadXML):
    """
    Container for the ReadYAML, ReadJSON, and ReadXML classes.

    This class is developed specifically to read .jwc file types, which can
    mix JSON, XML, and YAML formats.  It can also be used to read a straight
    XML, JSON, or YAML file.

    :param file_name: The file name to be read including the path
    :param print_lines: The number of lines to be printed to the screen if
                        the user prints an instance of the class.  Defaulted
                        to 50
    :raises FileNotFoundError: If the file does not exist

    Example File
    ------------

    .. code-block:: text

       --- # First document in file
       Float Value: 4.387
       Double Value: 1.11111187
       integer: 6
       String: Hello
       Float List: [1.1, 2.2, 3.3, 4.4]
       Yaml Block List:
         - 1
         - 2
         - 3
         - 4
       Yaml Dict:
         First Key: 3.3
         Second Key: 4.4
         Third Key: 5.5
         Fourth Key: 6.6
       String List:
         - Hello World
         - How are you
       JSON Data: {"book": "History of the World", "Year": 1976}
       XML Data:
       <root>
           <book>History of the World</book>
           <Year>1976</Year>
       </root>
       --- # Second document in file
       # Notice that a : character is not required
       Another Int 3

    Instantiation Example
    ---------------------
    Printing an instance shows the first ``print_lines`` lines of the file,
    each stripped of surrounding whitespace.  The ``print_lines`` attribute
    can be changed after instantiation.

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       reader = ReadKeyWords("test_key_words.jwc", print_lines=2)
       print(reader)

    .. code-block:: bash

       >> --- # First document in file
       >> Float Value: 4.387

    Read Scalar Values
    ------------------

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       reader = ReadKeyWords("test_key_words.jwc")
       int_value = reader.read_key_value("integer:", int)
       double_value = reader.read_key_value("Double Value:", float)
       # Read from second document in file
       second_doc = reader.read_key_value("Another Int", int, 1)
       print("Integer Value: ", int_value)
       print("Double Value: ", double_value)
       print(second_doc)

    .. code-block:: bash

       >> Integer Value: 6
       >> Double Value: 1.11111187
       >> 3

    Read List Values
    ----------------
    Lists can be stored inline or in block format.

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       reader = ReadKeyWords("test_key_words.jwc")
       inline_list = reader.read_yaml_list("Float List:", float)
       block_list = reader.read_yaml_list("Yaml Block List:", int)
       print("Inline List: ", inline_list)
       print("Block List: ", block_list)

    .. code-block:: bash

       >> Inline List: [1.1, 2.2, 3.3, 4.4]
       >> Block List: [1, 2, 3, 4]

    Read JSON and XML
    -----------------

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       reader = ReadKeyWords("test_key_words.jwc")
       json_data = reader.read_json("JSON Data:")
       xml_data = reader.read_xml("XML Data:")
       print("JSON Data: ", json_data)
       print("XML Data: ", xml_data)

    .. code-block:: bash

       >> JSON Data: {'book': 'History of the World', 'Year': 1976}
       >> XML Data: {'root': {'book': 'History of the World', 'Year': '1976'}}

    Read YAML Dictionaries
    ----------------------
    Unlike JSON and XML, dictionaries read in from a YAML format must be
    flat (i.e. no nested dictionaries) and of a uniform data type.

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       reader = ReadKeyWords("test_key_words.jwc")
       yaml_dict = reader.read_yaml_dict("Yaml Dict:", str, float)
       print("YAML Dictionary: ", yaml_dict)

    .. code-block:: bash

       >> YAML Dictionary: {'First Key': 3.3, 'Second Key': 4.4,
           'Third Key': 5.5, 'Fourth Key': 6.6}

    **Note:** In order to read in a dictionary of lists, use the
    ``read_yaml_dict_of_list`` method.

    YAML, JSON, and XML Files
    -------------------------
    To read a .yaml, .json, or .xml file that does not contain mixed data,
    use one of these three methods.

    .. code-block:: python

       from cobralib.io import ReadKeyWords
       yaml_reader = ReadKeyWords("test_key_words.yaml")
       yaml_data = yaml_reader.read_full_yaml()
       json_reader = ReadKeyWords("test_key_words.json")
       json_data = json_reader.read_full_json()
       xml_reader = ReadKeyWords("test_key_words.xml")
       xml_data = xml_reader.read_full_xml()
    """

    def __init__(self, file_name: str, print_lines: int = 50):
        # Fail early, before any parent class touches the file
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")

        # The parents do not chain through super(), so each is set up in turn
        for parent in (ReadYAML, ReadJSON, ReadXML):
            parent.__init__(self, file_name)

        self._file_name = file_name
        self.__lines = self._read_lines()
        self.print_lines = print_lines

    # ==========================================================================================
    # PRIVATE-LIKE methods

    def _read_lines(self):
        """
        Return every line of the file stripped of surrounding whitespace
        """
        with open(self._file_name) as file:
            return [line.strip() for line in file]

    # ------------------------------------------------------------------------------------------

    def __str__(self):
        """
        Return the first ``print_lines`` stored lines joined by newlines
        """
        shown = min(self.print_lines, len(self.__lines))
        return "\n".join(self.__lines[:shown])
# ========================================================================================== # ========================================================================================== # READ COLUMNAR DATA
def read_csv_columns_by_headers(
    file_name: str, headers: dict[str, type], skip: int = 0
) -> pd.DataFrame:
    """
    Load selected, named columns of a comma-delimited file into a dataframe.

    :param file_name: The file name, including its path
    :param headers: A dictionary mapping each column header to be read onto
                    the data type for that column (e.g. ``int``, ``float``,
                    ``str``).  Only the columns named here are returned.
    :param skip: The number of lines at the top of the file to be skipped
                 before the header row is read
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    The file is parsed with a comma delimiter.  For any other delimiter use
    ``read_text_columns_by_headers`` instead.

    Assume a file titled ``test.csv`` with the following content.

    .. code-block:: text

        ID,Inventory,Weight_per,Number
        1,shoes,1.5,5
        2,t-shirt,1.8,3
        3,coffee,2.1,15
        4,books,3.2,40

    .. code-block:: python

        from cobralib.io import read_csv_columns_by_headers
        > file_name = 'test.csv'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_csv_columns_by_headers(file_name, headers)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the header row is preceded by metadata lines, pass their count as
    ``skip``; e.g. ``read_csv_columns_by_headers(file_name, headers, skip=2)``
    for a file whose first two lines are free text.
    """
    if os.path.isfile(file_name):
        # The dict keys double as the column selection
        return pd.read_csv(
            file_name, skiprows=skip, dtype=headers, usecols=list(headers)
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ----------------------------------------------------------------------------
def read_csv_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    skip: int = 0,
) -> pd.DataFrame:
    """
    Load selected columns of a header-less, comma-delimited file by position.

    :param file_name: The file name, including its path
    :param headers: A dictionary mapping each column index to be read onto the
                    data type for that column (e.g. ``int``, ``float``,
                    ``str``)
    :param col_names: A list containing the names to be given to the columns
    :param skip: The number of lines at the top of the file to be skipped
                 before data are read
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    The file is parsed with a comma delimiter.  For any other delimiter use
    ``read_text_columns_by_index`` instead.

    Assume a file titled ``test.csv`` with the following content.

    .. code-block:: text

        1,shoes,1.5,5
        2,t-shirt,1.8,3
        3,coffee,2.1,15
        4,books,3.2,40

    .. code-block:: python

        from cobralib.io import read_csv_columns_by_index
        > file_name = 'test.csv'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_csv_columns_by_index(file_name, headers, names)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the data are preceded by metadata lines, pass their count as
    ``skip``; e.g.
    ``read_csv_columns_by_index(file_name, headers, names, skip=2)``.
    """
    if os.path.isfile(file_name):
        positions = list(headers)
        return pd.read_csv(
            file_name,
            skiprows=skip,
            dtype=headers,
            names=col_names,
            usecols=positions,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ------------------------------------------------------------------------------------------
def read_text_columns_by_headers(
    file_name: str,
    headers: dict[str, type],
    skip: int = 0,
    delimiter: str = r"\s+",
) -> pd.DataFrame:
    """
    Load selected, named columns of a delimited text file into a dataframe.

    :param file_name: The file name, including its path
    :param headers: A dictionary mapping each column header to be read onto
                    the data type for that column (e.g. ``int``, ``float``,
                    ``str``).  Only the columns named here are returned.
    :param skip: The number of lines at the top of the file to be skipped
                 before the header row is read
    :param delimiter: The separator between values.  Defaulted to one or more
                      white space characters.  Any delimiter may be supplied,
                      including a comma, although a comma delimited file
                      should normally carry a .csv extension.
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    Assume a file titled ``test.txt`` with the following content.

    .. code-block:: text

        ID Inventory Weight_per Number
        1 shoes 1.5 5
        2 t-shirt 1.8 3
        3 coffee 2.1 15
        4 books 3.2 40

    .. code-block:: python

        from cobralib.io import read_text_columns_by_headers
        > file_name = 'test.txt'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_text_columns_by_headers(file_name, headers)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the header row is preceded by metadata lines, pass their count as
    ``skip``; e.g. ``read_text_columns_by_headers(file_name, headers, skip=2)``.
    """
    if os.path.isfile(file_name):
        wanted = list(headers)
        return pd.read_csv(
            file_name, sep=delimiter, skiprows=skip, dtype=headers, usecols=wanted
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# --------------------------------------------------------------------------------
def read_text_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    skip: int = 0,
    delimiter: str = r"\s+",
) -> pd.DataFrame:
    """
    Load selected columns of a header-less delimited text file by position.

    :param file_name: The file name, including its path
    :param headers: A dictionary mapping each column index to be read onto the
                    data type for that column (e.g. ``int``, ``float``,
                    ``str``)
    :param col_names: A list containing the names to be given to the columns
    :param skip: The number of lines at the top of the file to be skipped
                 before data are read
    :param delimiter: The separator between values.  Defaulted to one or more
                      white space characters.  Any delimiter may be supplied,
                      including a comma, although a comma delimited file
                      should normally carry a .csv extension.
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    Assume a file titled ``test.txt`` with the following content.

    .. code-block:: text

        1 shoes 1.5 5
        2 t-shirt 1.8 3
        3 coffee 2.1 15
        4 books 3.2 40

    .. code-block:: python

        from cobralib.io import read_text_columns_by_index
        > file_name = 'test.txt'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_text_columns_by_index(file_name, headers, names)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the data are preceded by lines that must be ignored, pass their count
    as ``skip``; e.g.
    ``read_text_columns_by_index(file_name, headers, names, skip=2)``.
    """
    if os.path.isfile(file_name):
        positions = list(headers)
        return pd.read_csv(
            file_name,
            sep=delimiter,
            skiprows=skip,
            dtype=headers,
            names=col_names,
            usecols=positions,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ------------------------------------------------------------------------------------------
def read_excel_columns_by_headers(
    file_name: str, tab: str, headers: dict[str, type], skip: int = 0
) -> pd.DataFrame:
    """
    Load selected, named columns of one Excel worksheet into a dataframe.

    :param file_name: The file name, including its path.  The workbook is
                      parsed with the ``openpyxl`` engine, so its content must
                      be in the Office Open XML (.xlsx) format; legacy binary
                      .xls workbooks are not supported by that engine.
    :param tab: The tab or sheet name that data will be read from
    :param headers: A dictionary mapping each column header to be read onto
                    the data type for that column (e.g. ``int``, ``float``,
                    ``str``).  Only the columns named here are returned.
    :param skip: The number of rows at the top of the sheet to be skipped
                 before the header row is read
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    Assume a workbook titled ``test.xlsx`` with the following content in a tab
    titled ``primary``.

    .. list-table:: test.xlsx
       :widths: 6 10 6 6
       :header-rows: 1

       * - ID
         - Inventory
         - Weight_per
         - Number
       * - 1
         - shoes
         - 1.5
         - 5
       * - 2
         - t-shirt
         - 1.8
         - 3
       * - 3
         - coffee
         - 2.1
         - 15
       * - 4
         - books
         - 3.2
         - 40

    .. code-block:: python

        from cobralib.io import read_excel_columns_by_headers
        > file_name = 'test.xlsx'
        > tab = "primary"
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_excel_columns_by_headers(file_name, tab, headers)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the header row is preceded by metadata rows, pass their count as
    ``skip``; e.g.
    ``read_excel_columns_by_headers(file_name, tab, headers, skip=2)``.
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")

    return pd.read_excel(
        file_name,
        sheet_name=tab,
        usecols=list(headers),
        dtype=headers,
        skiprows=skip,
        # Engine is pinned rather than inferred; this is what restricts the
        # accepted workbook format (see the file_name parameter above).
        engine="openpyxl",
    )
# ----------------------------------------------------------------------------
def read_excel_columns_by_index(
    file_name: str,
    tab: str,
    col_index: dict[int, type],
    col_names: list[str],
    skip: int = 0,
) -> pd.DataFrame:
    """
    Load selected columns of a header-less Excel worksheet by position.

    :param file_name: The file name, including its path.  The workbook is
                      parsed with the ``openpyxl`` engine, so its content must
                      be in the Office Open XML (.xlsx) format; legacy binary
                      .xls workbooks are not supported by that engine.
    :param tab: The tab or sheet name that data will be read from
    :param col_index: A dictionary mapping each column index to be read onto
                      the data type for that column (e.g. ``int``, ``float``,
                      ``str``)
    :param col_names: A list containing the names to be given to the columns
    :param skip: The number of rows at the top of the sheet to be skipped
                 before data are read
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    The sheet is read with ``header=None``, i.e. no row is interpreted as a
    header; every row remaining after ``skip`` is treated as data.

    Assume a workbook titled ``test.xlsx`` with the following content in a tab
    titled ``primary``.

    .. list-table:: test.xlsx
       :widths: 6 10 6 6
       :header-rows: 0

       * - 1
         - shoes
         - 1.5
         - 5
       * - 2
         - t-shirt
         - 1.8
         - 3
       * - 3
         - coffee
         - 2.1
         - 15
       * - 4
         - books
         - 3.2
         - 40

    .. code-block:: python

        from cobralib.io import read_excel_columns_by_index
        > file_name = 'test.xlsx'
        > tab = 'primary'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_excel_columns_by_index(file_name, tab, headers, names)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40

    When the data are preceded by rows that must be ignored, pass their count
    as ``skip``; e.g.
    ``read_excel_columns_by_index(file_name, tab, headers, names, skip=2)``.
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")

    return pd.read_excel(
        file_name,
        sheet_name=tab,
        usecols=list(col_index),
        names=col_names,
        dtype=col_index,
        skiprows=skip,
        # Names come from col_names, so no sheet row is consumed as a header
        header=None,
        # Engine is pinned rather than inferred; this is what restricts the
        # accepted workbook format (see the file_name parameter above).
        engine="openpyxl",
    )
# ------------------------------------------------------------------------------------------
def read_pdf_columns_by_headers(
    file_name: str,
    headers: dict[str, type],
    table_idx: int = 0,
    page_num: int = 0,
    skip: int = 0,
) -> pd.DataFrame:
    """
    Read one table from one page of a PDF document and return the
    user-specified columns as a pandas DataFrame.

    Only the page selected by ``page_num`` is inspected; a table that
    continues onto another page is not stitched together.

    **NOTE:** The pdf document must be a vectorized pdf document and not a
    scan of another document for this function to work.

    :param file_name: The file name, including the path, of the PDF file.
    :param headers: A dictionary of column names and their data types, e.g.
                    ``int``, ``float``, and ``str``.  The names must match the
                    text in the first row of the extracted table.
    :param table_idx: Index of the table to extract from the page (default: 0).
    :param page_num: Index into the document's page list of the page holding
                     the table (default: 0).
    :param skip: The number of data rows, counted from directly below the
                 header row, to be dropped from the result (default: 0).
    :return df: A pandas DataFrame containing the specified columns, in the
                order given by ``headers``.  When ``skip`` is non-zero the
                index keeps the original row positions.
    :raises FileNotFoundError: If the PDF file is found to not exist.
    :raises ValueError: If ``table_idx`` is out of range, the table has no
                        rows, or a requested column is not in the table.

    Example usage:

    .. code-block:: python

        from cobralib.io import read_pdf_columns_by_headers
        > file_name = 'test.pdf'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_pdf_columns_by_headers(file_name, headers, table_idx=0, page_num=1)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")

    # Pull every table off the requested page; the handle is only needed here
    with pdfplumber.open(file_name) as pdf:
        tables = pdf.pages[page_num].extract_tables()

    if table_idx >= len(tables):
        raise ValueError(f"Table index {table_idx} out of range.")

    rows = tables[table_idx]
    if not rows:
        raise ValueError(f"Table index {table_idx} contains no rows.")

    # First table row supplies the column labels, the rest is data
    df = pd.DataFrame(rows[1:], columns=rows[0]).iloc[skip:]

    # Report absent columns by name instead of a pandas length-mismatch error
    missing = [name for name in headers if name not in df.columns]
    if missing:
        raise ValueError(
            f"Columns {missing} not found in table {table_idx}; "
            f"available columns are {list(df.columns)}"
        )

    # Copy so the dtype conversion below never writes into a view of the slice
    df = df[list(headers)].copy()
    for column, dtype in headers.items():
        df[column] = df[column].astype(dtype)
    return df
# ------------------------------------------------------------------------------------------
def read_pdf_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    table_idx: int = 0,
    skip_rows: int = 0,
    page_num: int = 0,
) -> pd.DataFrame:
    """
    Read one table from one page of a PDF document and return the
    user-specified columns, selected by column index, as a pandas DataFrame.

    Only the page selected by ``page_num`` is inspected; a table that
    continues onto another page is not stitched together.

    **NOTE:** The pdf document must be a vectorized pdf document and not a
    scan of another document for this function to work.

    :param file_name: The file name, including the path, of the PDF file.
    :param headers: A dictionary of column index and their data types, e.g.
                    ``int``, ``float``, and ``str``.  An index that does not
                    exist in the table is ignored together with its name.
    :param col_names: A list containing the names to be given to each column;
                      the i-th name belongs to the i-th entry of ``headers``.
    :param table_idx: Index of the table to extract from the page (default: 0).
    :param skip_rows: The number of rows, counted from directly below the
                      first table row, to be dropped from the result
                      (default: 0).
    :param page_num: Index into the document's page list of the page holding
                     the table (default: 0).
    :return df: A pandas DataFrame containing the specified columns.  When
                ``skip_rows`` is non-zero the index keeps the original row
                positions.
    :raises FileNotFoundError: If the PDF file is found to not exist.
    :raises ValueError: If ``table_idx`` is out of range, the table has no
                        rows, or ``col_names`` has fewer entries than
                        ``headers``.

    The first row of the extracted table is always consumed as the column
    labels (which are then replaced by ``col_names``); it never appears as
    data in the result.

    Example usage:

    .. code-block:: python

        from cobralib.io import read_pdf_columns_by_index
        > file_name = 'test.pdf'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > col_names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_pdf_columns_by_index(file_name, headers, col_names,
                                         table_idx=0, page_num=1)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")

    if len(col_names) < len(headers):
        raise ValueError(
            f"{len(headers)} columns requested but only "
            f"{len(col_names)} column names supplied"
        )

    # Pull every table off the requested page; the handle is only needed here
    with pdfplumber.open(file_name) as pdf:
        tables = pdf.pages[page_num].extract_tables()

    if table_idx >= len(tables):
        raise ValueError(f"Table index {table_idx} out of range.")

    rows = tables[table_idx]
    if not rows:
        raise ValueError(f"Table index {table_idx} contains no rows.")

    df = pd.DataFrame(rows[1:], columns=rows[0]).iloc[skip_rows:]

    # Keep index, name and dtype together so that dropping an out-of-range
    # index cannot shift a name or dtype onto the wrong column.
    n_columns = df.shape[1]
    selected = [
        (col_idx, name, dtype)
        for (col_idx, dtype), name in zip(headers.items(), col_names)
        if col_idx < n_columns
    ]

    # Copy so the dtype conversion below never writes into a view of the slice
    df = df.iloc[:, [col_idx for col_idx, _, _ in selected]].copy()
    df.columns = [name for _, name, _ in selected]
    for _, name, dtype in selected:
        df[name] = df[name].astype(dtype)
    return df
# ========================================================================================== # ========================================================================================== # READ AND WRITE TO YAML
def write_yaml_file(file_path: str, data: dict, append: bool = False) -> None:
    """
    Write data to a YAML file, or append it as a new YAML document.

    :param file_path: The path of the YAML file
    :param data: The data to be written or appended
    :param append: True to append data to an existing file, False to overwrite
                   the file or create a new one (default: False)
    :raises FileNotFoundError: If the file does not exist in append mode

    In append mode a ``---`` document separator line is written ahead of the
    new data.  An ``OSError`` raised while opening or writing the file is not
    propagated; its message is printed instead.

    .. code-block:: python

        from cobralib.io import write_yaml_file

        dict_file = {'sports' : ['soccer', 'football', 'basketball',
                                 'cricket', 'hockey', 'table tennis']},
                    {'countries' : ['Pakistan', 'USA', 'India', 'China',
                                    'Germany', 'France', 'Spain']}
        # Create new yaml file
        write_yaml_file('new_file.yaml', dict_file, append=False)

    This will create a file titled new_file.yaml with the following contents

    .. literalinclude:: ../../../data/test/output.yaml
       :language: text

    """
    if append:
        # Appending only makes sense when there is something to append to
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File '{file_path}' not found.")
        open_mode = "a"
    else:
        open_mode = "w"

    try:
        with open(file_path, open_mode) as stream:
            if append:
                stream.write("---\n")  # Add YAML document separator
            yaml.safe_dump(data, stream)
    except OSError as err:
        print(f"Error writing to file: {err}")
# ========================================================================================== # ==========================================================================================
class Logger:
    """
    Custom logging class that writes messages to both console and log file.

    :param filename: The name of the file to write logs to.  It is also used
                     as the name of the underlying ``logging`` logger.
    :param console_level: The minimum logging level for the console. Should
                          be one of: 'NOTSET', 'DEBUG', 'INFO', 'WARNING',
                          'ERROR', 'CRITICAL' (case-insensitive).
    :param file_level: The minimum logging level for the log file. Should be
                       one of: 'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR',
                       'CRITICAL' (case-insensitive).
    :param max_lines: The maximum number of lines in the log file. When
                      exceeded, the oldest lines are deleted.  ``None``
                      disables trimming.
    :raises ValueError: If `console_level` or `file_level` are not valid
                        logging levels, or `max_lines` is negative.
    :raises IOError: If an I/O error occurs when opening the file.

    Creating a second ``Logger`` for a file name that already has one replaces
    the handlers installed by the earlier instance, so each record is written
    once.  Handlers attached to the same named logger by other code are left
    in place.

    **Example usage:**

    .. code-block:: python

        # create logger with filename='my_log.log', console_level='INFO',
        # file_level='DEBUG', and max_lines=100
        logger = Logger('my_log.log', 'INFO', 'DEBUG', 100)

        # log a DEBUG message
        logger.log('DEBUG', 'This is a debug message')

        # log an INFO message
        logger.log('INFO', 'This is an info message')
    """

    # Accepted level names and the logging-module constants they stand for
    _LEVELS = {
        "NOTSET": logging.NOTSET,
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }

    # Attribute used to recognise handlers installed by this class
    _OWNED_FLAG = "_cobralib_logger_handler"

    def __init__(self, filename, console_level, file_level, max_lines):
        # Validate everything before touching the shared logger or the file,
        # so a bad argument leaves no half-configured state behind.
        console_lvl = self._str_to_log_level(console_level)
        file_lvl = self._str_to_log_level(file_level)
        if max_lines is not None and max_lines < 0:
            raise ValueError(f"max_lines must be non-negative, got {max_lines}")

        self.filename = filename
        self.max_lines = max_lines

        # Creating logger
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(logging.DEBUG)

        # logging.getLogger returns the same object for the same name; drop
        # handlers left by an earlier Logger so records are not duplicated.
        for stale in [
            h for h in self.logger.handlers if getattr(h, self._OWNED_FLAG, False)
        ]:
            self.logger.removeHandler(stale)
            stale.close()

        # Creating console handler and setting its level
        ch = logging.StreamHandler()
        ch.setLevel(console_lvl)

        # Creating file handler and setting its level.  maxBytes is left at
        # its default of 0, so the handler never rolls over on its own; the
        # file size is bounded by _trim_log_file instead.
        fh = logging.handlers.RotatingFileHandler(filename, backupCount=1)
        fh.setLevel(file_lvl)

        # One shared formatter for both destinations
        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        formatter = logging.Formatter(fmt)

        for handler in (ch, fh):
            handler.setFormatter(formatter)
            setattr(handler, self._OWNED_FLAG, True)
            self.logger.addHandler(handler)

    # ------------------------------------------------------------------------------------------

    def _str_to_log_level(self, level):
        """
        Convert string representation of logging level to corresponding
        logging module constants.

        :param level: The string representation of the logging level
                      (case-insensitive).
        :return: Corresponding logging level.
        :raises ValueError: If `level` is not one of the supported names.
        """
        try:
            return self._LEVELS[str(level).upper()]
        except KeyError:
            valid = ", ".join(self._LEVELS)
            raise ValueError(
                f"Invalid logging level {level!r}; expected one of: {valid}"
            ) from None

    # ------------------------------------------------------------------------------------------

    def log(self, level, msg):
        """
        Write a log entry and then trim the log file.

        :param level: The level of the log entry. Should be one of: 'NOTSET',
                      'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
                      (case-insensitive).  A record logged at 'NOTSET' is
                      below the logger's DEBUG threshold and is therefore
                      discarded by the logging module.
        :param msg: The message to be logged.
        :raises ValueError: If `level` is not a valid logging level.
        """
        self.logger.log(self._str_to_log_level(level), msg)
        self._trim_log_file()

    def _trim_log_file(self):
        """
        Trims the log file to the last `max_lines` lines.  The file is left
        untouched when it is already within the limit.  An I/O error during
        trimming is logged through the logger rather than raised.
        """
        if self.max_lines is None:
            return
        try:
            with open(self.filename, "r+") as f:
                # Bounded deque keeps only the tail while the whole file is
                # counted, so memory stays proportional to max_lines.
                tail = deque(maxlen=self.max_lines)
                total = 0
                for line in f:
                    tail.append(line)
                    total += 1
                if total <= self.max_lines:
                    return  # nothing to drop; skip the rewrite
                f.seek(0)
                f.writelines(tail)
                f.truncate()
        except OSError:
            self.logger.exception("Error while trimming log file")
# ========================================================================================== # ========================================================================================== # eof