# Import necessary packages here
import json
import logging
import logging.handlers
import os
import re
import xml.etree.ElementTree as ET
from collections import deque
from typing import Any, Union
import pandas as pd
import pdfplumber
import xmltodict
import yaml
# ==========================================================================================
# ==========================================================================================
# File: io.py
# Date: July 09, 2023
# Author: Jonathan A. Webb
# Purpose: This file contains classes and functions that can be used to read and write
# to files
# ==========================================================================================
# ==========================================================================================
# Insert Code here
class ReadYAML:
    """
    Read typed values from a file with a YAML-like format.

    :param file_name: The name and path of the file with the YAML-like format
    :raises FileNotFoundError: If the file does not exist.

    This class is tailored to read basic YAML files, but with looser
    requirements on how key words are formatted, and stricter requirements on
    data typing. The methods within this class can be used to read scalar
    variables from key-value pairs, lists, flat dictionaries and flat
    dictionaries of lists. Every value is cast to the type requested by the
    caller.

    All lines of the file are read once, when the class is instantiated. Each
    read method then scans the requested document for the first line that
    *starts with* the keyword (leading white space is ignored). Documents are
    separated by lines that begin with ``---``. Only ``read_full_yaml`` hands
    the file to PyYAML.

    All code examples described in the documentation for this class reference
    the read_yaml.yaml file shown below.

    .. literalinclude:: ../../../data/test/read_yaml.yaml
       :language: text
    """

    # Symbols announcing a string stored on the following line(s):
    # "^" -> value is the next line, ">" -> lines folded into one line,
    # "|" -> lines kept as a multi-line string.
    _BLOCK_MARKERS = ("^", ">", "|")

    # A document separator is "---" at the very start of a line, followed by
    # white space or the end of the line. Matching whole markers (instead of
    # any "---" substring) keeps values such as "a---b" and rulers such as
    # "# ------" from splitting a document.
    _DOC_SEPARATOR = re.compile(r"^---(?=\s|$)", re.MULTILINE)

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__yamllines = self._read_yamllines()

    # ------------------------------------------------------------------------------------------

    def read_key_value(
        self, keyword: str, data_type: type, document_index: int = 0
    ) -> Any:
        """
        Read the scalar value that follows a keyword.

        :param keyword: The keyword associated with the value to be read in.
                        Unlike a pure YAML file this value does not have to
                        end with a : symbol
        :param data_type: The data type of the value to be read in
        :param document_index: The number of the yaml document in the yaml file.
        :return value: The value associated with a keyword
        :raises ValueError: If the keyword is not found, the document index is
                            out of range, or the value can not be cast to the
                            user defined type

        When ``data_type`` is ``str`` this method recognizes the ``^``, ``>``
        and ``|`` symbols: ``^`` reads the string from the next line, ``>``
        folds the following indented lines into one line and ``|`` keeps them
        as a multi-line string. Boolean values are case insensitive; true,
        yes and on equate to True, and false, no and off equate to False.

        Example
        -------
        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_key_value('key:', float, 0)
           print(value)
           >> 4.387
           value = reader.read_key_value('Multi Sentence:', str, 1)
           print(value)
           >> This is a multiline sentence,
              there is no reason to worry!
           flag = reader.read_key_value('bool test5:', bool, 1)
           print(flag)
           >> True
        """
        lines = self._document_lines(document_index)
        for index, line in enumerate(lines):
            stripped_line = line.lstrip()
            if stripped_line.startswith(keyword):
                keyword_indent = len(line) - len(stripped_line)
                value_str = stripped_line[len(keyword):].strip()
                return self._parse_value(
                    value_str, lines[index + 1:], keyword_indent, data_type
                )
        raise ValueError(f"Keyword '{keyword}' not found in the specified document")

    # ------------------------------------------------------------------------------------------

    def read_yaml_list(
        self, keyword: str, data_type: type, document_index: int = 0
    ) -> list[Any]:
        """
        Read the list that follows a keyword.

        :param keyword: The keyword associated with the list to be read in.
                        Unlike a pure YAML file, this value does not have to
                        end with a : symbol.
        :param data_type: The data type of every value in the list
        :param document_index: The number of the yaml document in the yaml file.
        :return value: The list associated with a keyword
        :raises ValueError: If the keyword is not found, the document index is
                            out of range, or a value can not be cast to the
                            user defined type

        The list may be written inline (``key: [1, 2, 3]``) or as a block of
        ``-`` items indented deeper than the keyword. An empty inline list
        (``[]``) yields an empty list. A block item may use the ``^``, ``>``
        or ``|`` symbols to store a string on the following line(s).
        **NOTE:** an item read with ``|`` keeps a trailing newline.

        Example
        -------
        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           list_values = reader.read_yaml_list('First List:', float, 0)
           print(list_values)
           >> [1.1, 2.2, 3.3, 4.4]
        """
        lines = self._document_lines(document_index)
        values = []
        is_reading_list = False
        keyword_indent = 0
        i = 0
        while i < len(lines):
            line = lines[i]
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)
            if stripped_line.startswith(keyword):
                keyword_indent = current_indent
                is_reading_list = True
                rest_of_line = stripped_line[len(keyword):].strip()
                if rest_of_line.startswith("[") and rest_of_line.endswith("]"):
                    return self._parse_inline_list(rest_of_line, data_type)
                i += 1
                continue
            if is_reading_list:
                # A line at or left of the keyword's indent ends the list.
                if current_indent <= keyword_indent:
                    break
                if stripped_line.startswith("-"):
                    value_str = stripped_line[1:].strip()
                    next_index = i + 1
                    if value_str in self._BLOCK_MARKERS:
                        marker = value_str
                        end = self._block_end(lines, i + 1, current_indent)
                        body = lines[i + 1:end]
                        next_index = end
                        if marker == "^":
                            # Only the first following line is the value.
                            value_str = body[0].strip() if body else ""
                            if body:
                                next_index = i + 2
                        elif marker == "|":
                            value_str = "".join(
                                text.strip() + "\n" for text in body
                            )
                        else:
                            value_str = " ".join(
                                text[current_indent:].lstrip() for text in body
                            ).rstrip()
                    values.append(self._cast(value_str, data_type))
                    i = next_index
                    continue
            i += 1
        if not is_reading_list:
            msg = f"Keyword '{keyword}' not found or it is "
            msg += "not a list in the specified document."
            raise ValueError(msg)
        return values

    # ------------------------------------------------------------------------------------------

    def read_yaml_dict(
        self,
        keyword: str,
        key_data_type: type,
        value_data_type: type,
        document_index: int = 0,
    ) -> dict:
        """
        Read the flat dictionary that follows a keyword.

        :param keyword: The keyword associated with the dictionary to be read
                        in. Unlike a pure YAML file, this value does not have
                        to end with a : symbol.
        :param key_data_type: The data type of the key value.
        :param value_data_type: The data type of the value to be read in
        :param document_index: The number of the yaml document in the yaml file.
        :return value: The dictionary associated with a keyword
        :raises ValueError: If the keyword is not found, the document index is
                            out of range, or a key or value can not be cast to
                            the user defined type

        Every ``key: value`` line indented deeper than the keyword becomes one
        entry. A value of ``^``, ``>`` or ``|`` means the value is stored on
        the following, deeper indented lines; those lines are returned with
        their common indentation removed and joined by newlines.
        **NOTE:** this method does not fold ``>`` blocks into a single line,
        and it assumes a flat (i.e. not nested) dictionary structure.

        Example
        -------
        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_yaml_dict('Ages:', str, int, 1)
           print(value)
           >> {'Jon': 44, 'Jill': 32, 'Bob': 12}
        """
        lines = self._document_lines(document_index)
        found_dict = {}
        is_reading_dict = False
        keyword_indent = 0
        i = 0
        while i < len(lines):
            line = lines[i]
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)
            if stripped_line.startswith(keyword):
                is_reading_dict = True
                keyword_indent = current_indent
                i += 1
                continue
            if is_reading_dict:
                # A line at or left of the keyword's indent ends the dict.
                if current_indent <= keyword_indent:
                    break
                if ":" in stripped_line:
                    key_str, value_str = map(str.strip, stripped_line.split(":", 1))
                    key = self._parse_value(key_str, [], current_indent, key_data_type)
                    if value_str in self._BLOCK_MARKERS:
                        end = self._block_end(lines, i + 1, current_indent)
                        # Dedent before casting so that non-string types can
                        # also be stored on the following line.
                        block = self._remove_uniform_indent(
                            "\n".join(lines[i + 1:end])
                        )
                        found_dict[key] = self._cast(block, value_data_type)
                        i = end
                        continue
                    found_dict[key] = self._parse_value(
                        value_str, [], current_indent, value_data_type
                    )
            i += 1
        if not is_reading_dict:
            msg = f"Keyword '{keyword}' not found or it is not a "
            msg += " dictionary in the specified document."
            raise ValueError(msg)
        return found_dict

    # ------------------------------------------------------------------------------------------

    def read_full_yaml(self, safe_read: bool = True) -> Any:
        """
        Read the full YAML file with PyYAML.

        :param safe_read: True to parse with ``yaml.safe_load_all``, False to
                          parse with ``yaml.load_all`` and the ``FullLoader``.
                          Defaulted to True
        :return Any: A list with one entry per YAML document in the file

        Unlike other methods in this class, this method parses the entire
        file with PyYAML, so the file must adhere to the strict rules of YAML.
        The rules can be found at
        `PyYaml <https://pyyaml.org/wiki/PyYAMLDocumentation>`_.

        Example
        -------
        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           data = reader.read_full_yaml()  # Read in as safe mode
           print(data[1]['Ages'])
           >> {'Jon': 44, 'Jill': 32, 'Bob': 12}
        """
        with open(self._file_name) as file:
            if safe_read:
                return list(yaml.safe_load_all(file))
            # PyYAML >= 6 requires an explicit Loader. FullLoader is what
            # PyYAML 5.x used when none was given.
            # SECURITY: only use safe_read=False on files you trust.
            return list(yaml.load_all(file, Loader=yaml.FullLoader))

    # ------------------------------------------------------------------------------------------

    def read_yaml_dict_of_list(
        self,
        keyword: str,
        key_data_type: type,
        list_data_type: type,
        document_index: int = 0,
    ) -> dict:
        """
        Read the flat dictionary of lists that follows a keyword.

        :param keyword: The keyword associated with the dictionary to be read
                        in. Unlike a pure YAML file, this value does not have
                        to end with a : symbol.
        :param key_data_type: The data type of the key value.
        :param list_data_type: The data type of every value in the lists
        :param document_index: The number of the yaml document in the yaml file.
        :return value: The dictionary associated with a keyword
        :raises ValueError: If the keyword is not found, the document index is
                            out of range, a list item appears before any key,
                            or a key or value can not be cast to the user
                            defined type

        Each key may hold an inline list (``One: [1, 2, 3]``) or be followed
        by ``-`` items. Items may use the ``^``, ``>`` or ``|`` symbols to
        store a string on the following line(s). **NOTE:** This method assumes
        a flat (i.e. not nested) dictionary structure.

        Example
        -------
        .. code-block:: python

           from cobralib.io import ReadYAML
           reader = ReadYAML('read_yaml.yaml')
           value = reader.read_yaml_dict_of_list('Dict List:', str, int, 0)
           print(value)
           >> {'One': [1, 2, 3], 'Two': [3, 4, 5], 'Three': [6, 7, 8]}
        """
        lines = self._document_lines(document_index)
        is_reading_dict = False
        keyword_indent = 0
        current_dict = {}
        current_list = None
        i = 0
        while i < len(lines):
            line = lines[i]
            stripped_line = line.lstrip()
            current_indent = len(line) - len(stripped_line)
            if stripped_line.startswith(keyword):
                is_reading_dict = True
                keyword_indent = current_indent
                i += 1
                continue
            if is_reading_dict:
                if current_indent <= keyword_indent:
                    break
                # "- a: b" is a list item that happens to contain a colon,
                # not a new key, so detect well formed items first.
                is_item = stripped_line == "-" or stripped_line.startswith("- ")
                if ":" in stripped_line and not is_item:
                    key, value = map(str.strip, stripped_line.split(":", 1))
                    key = self._cast(key, key_data_type)
                    if value.startswith("[") and value.endswith("]"):
                        current_dict[key] = self._parse_inline_list(
                            value, list_data_type
                        )
                        current_list = None
                    else:
                        current_list = []
                        current_dict[key] = current_list
                elif stripped_line.startswith("-"):
                    if current_list is None:
                        raise ValueError(
                            f"List item '{stripped_line}' under keyword "
                            f"'{keyword}' does not belong to any key"
                        )
                    value_str = stripped_line[1:].strip()
                    next_index = i + 1
                    if value_str in self._BLOCK_MARKERS:
                        marker = value_str
                        end = self._block_end(lines, i + 1, current_indent)
                        body = lines[i + 1:end]
                        value_str = self._parse_block_scalar(
                            body, current_indent, marker
                        )
                        # Resume at the line that ended the block so it is
                        # still processed as the next item or key.
                        next_index = i + 2 if (marker == "^" and body) else end
                    current_list.append(self._cast(value_str, list_data_type))
                    i = next_index
                    continue
            i += 1
        if not is_reading_dict:
            msg = f"Keyword '{keyword}' not found or it is not "
            msg += "dictionary of lists in the specified document."
            raise ValueError(msg)
        return current_dict

    # ==========================================================================================
    # PRIVATE-LIKE methods

    def _read_yamllines(self):
        """
        This private method will read in all lines from the text file, with
        trailing white space removed from each line
        """
        with open(self._file_name) as file:
            lines = [line.rstrip() for line in file]
        return lines

    # ------------------------------------------------------------------------------------------

    def _read_yaml_documents(self):
        """
        Split the stored lines into documents and drop the blank ones
        """
        text = "\n".join(self.__yamllines)
        return [doc for doc in self._DOC_SEPARATOR.split(text) if doc.strip()]

    # ------------------------------------------------------------------------------------------

    def _check_document_length(self, document_index, yaml_docs) -> None:
        """
        Raise a ValueError when document_index is not a valid index
        """
        if document_index >= len(yaml_docs) or document_index < 0:
            raise ValueError(
                f"Document index {document_index} out of range.\n"
                f"File contains {len(yaml_docs)} documents."
            )

    # ------------------------------------------------------------------------------------------

    def _document_lines(self, document_index: int) -> list:
        """
        Return the lines of one document after validating its index
        """
        yaml_docs = self._read_yaml_documents()
        self._check_document_length(document_index, yaml_docs)
        return yaml_docs[document_index].split("\n")

    # ------------------------------------------------------------------------------------------

    def _calculate_indent(self, line: str):
        """
        Return the number of leading white space characters in a line
        """
        return len(line) - len(line.lstrip())

    # ------------------------------------------------------------------------------------------

    @staticmethod
    def _block_end(lines: list, start: int, parent_indent: int) -> int:
        """
        Return the index of the first line at or after ``start`` that is not
        indented deeper than ``parent_indent`` (``len(lines)`` if none is)
        """
        index = start
        while index < len(lines):
            text = lines[index]
            if len(text) - len(text.lstrip()) <= parent_indent:
                break
            index += 1
        return index

    # ------------------------------------------------------------------------------------------

    def _parse_block_scalar(self, lines, current_indent: int, complex_str: str):
        """
        Build the string announced by a ``^``, ``>`` or ``|`` symbol.

        Lines are taken from ``lines`` until one is not indented deeper than
        ``current_indent``. **NOTE:** when ``lines`` is an iterator, the line
        that ends the block is consumed as well.
        """
        value_str = ""
        lines_iter = iter(lines)
        while True:
            # An exhausted iterator yields "", whose indent of 0 ends the loop
            line = next(lines_iter, "").rstrip()
            next_indent = self._calculate_indent(line)
            if next_indent <= current_indent:
                break
            line_content = line[next_indent:].lstrip()
            if complex_str == "^":
                value_str = line_content.strip()
                break
            elif complex_str == "|":
                value_str += line_content + "\n"
            elif complex_str == ">":
                value_str += line_content + " "
        return value_str.rstrip() if complex_str in ["|", ">"] else value_str

    # ------------------------------------------------------------------------------------------

    def _remove_uniform_indent(self, multi_line_str: str) -> str:
        """
        Remove the indentation shared by all non-blank lines of a string
        """
        lines = multi_line_str.split("\n")
        indents = [
            len(line) - len(line.lstrip()) for line in lines if line.strip()
        ]
        # default=0 keeps empty or blank input from producing a float("inf")
        # slice index.
        min_indent = min(indents, default=0)
        return "\n".join(line[min_indent:] for line in lines)

    # ------------------------------------------------------------------------------------------

    def _parse_inline_list(self, text: str, data_type: type) -> list:
        """
        Cast the comma separated items of a ``[...]`` string; ``[]`` is empty
        """
        inner = text[1:-1]
        if not inner.strip():
            return []
        return [self._cast(item.strip(), data_type) for item in inner.split(",")]

    # ------------------------------------------------------------------------------------------

    def _cast(self, value_str: str, data_type: type) -> Any:
        """
        Cast a string to ``data_type``, with YAML-style boolean words.

        ``bool`` needs special handling because ``bool("off")`` is True.
        """
        if data_type is bool:
            word = value_str.upper()
            if word in ("TRUE", "YES", "ON"):
                return True
            if word in ("FALSE", "NO", "OFF"):
                return False
            raise ValueError("Invalid boolean value")
        try:
            return data_type(value_str)
        except ValueError as err:
            raise ValueError("Invalid value") from err

    # ------------------------------------------------------------------------------------------

    def _parse_value(
        self, value_str: str, subsequent_lines: list, keyword_indent: int, data_type: type
    ) -> Any:
        """
        Cast a value, first resolving a ``^``, ``>`` or ``|`` symbol into the
        string stored on ``subsequent_lines`` when ``data_type`` is ``str``
        """
        if data_type == str and value_str in self._BLOCK_MARKERS:
            value_str = self._parse_block_scalar(
                subsequent_lines, keyword_indent, value_str
            )
        return self._cast(value_str, data_type)
# ==========================================================================================
# ==========================================================================================
class ReadJSON:
    """
    Read JSON data from a pure JSON file or from a mixed, JSON-like file.

    :param file_name: The name and path of the file with the json-like
                      format. While not required, it is recommended that a
                      mixed file use a .jwc extension.
    :raises FileNotFoundError: If the file does not exist.

    The file containing json data can be a pure .json file, or it can mix
    JSON values with yaml-like key value pairs. ``read_json`` reads the JSON
    value that follows a keyword in a mixed file, and ``read_full_json``
    parses the whole file as JSON. All lines of the file are read once, when
    the class is instantiated.
    """

    # Returned by _find_key when the key is absent, so that a key whose value
    # is JSON null (None) can still be told apart from a missing key.
    _NOT_FOUND = object()

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__jsonlines = self._read_jsonlines()

    # ------------------------------------------------------------------------------------------

    def read_json(self, keyword: str) -> dict:
        """
        Read the JSON value written to the right of a keyword.

        :param keyword: The keyword to search for at the start of each line
                        (leading white space is ignored).
        :return: The decoded JSON value; a dictionary for a JSON object.
        :raises ValueError: If the keyword is not found or if no valid JSON
                            value follows it.

        The value may span several lines and may be an object, an array or a
        scalar. Decoding stops at the end of the first complete JSON value,
        so whatever follows it in the file is ignored.

        Example
        -------
        This example shows a file that mixes YAML and JSON data types. In
        order to delineate a file that contains mixed data, it is recommended
        that the .jwc file format be used; however, it is not required.

        .. code-block:: text

           Yaml Key: Test String
           Yaml Dict:
             One: 1.1
             Two: 2.2
           Json Book Data: {"book": "History of the World", "year": 1976}

        .. code-block:: python

           from cobralib.io import ReadJSON
           reader = ReadJSON("test_key_words.jwc")
           value = reader.read_json("Json Book Data:")
           print(value)
           >> {'book': 'History of the World', 'year': 1976}
        """
        fragments = None
        for raw_line in self.__jsonlines:
            line = raw_line.strip()
            if fragments is None:
                if line.startswith(keyword):
                    fragments = [line.split(keyword, 1)[-1].lstrip()]
            else:
                fragments.append(line)
        if fragments is None:
            raise ValueError(f"Keyword '{keyword}' not found in the file")
        # raw_decode parses exactly one JSON value and reports where it ends.
        # Unlike counting "{" and "}", it handles arrays and brackets inside
        # strings. It does not skip leading white space, hence lstrip().
        text = " ".join(fragments).lstrip()
        try:
            value, _ = json.JSONDecoder().raw_decode(text)
        except json.JSONDecodeError as err:
            raise ValueError(
                f"Invalid JSON data for keyword '{keyword}': {err}"
            ) from err
        return value

    # ------------------------------------------------------------------------------------------

    def read_full_json(self, keyword: Union[str, None] = None) -> Union[dict, list]:
        """
        Read the entire contents of the file as JSON data.

        :param keyword: The key to search for in the JSON data. If None,
                        the entire JSON data is returned.
        :return: The whole JSON data, or the value stored under the first
                 occurrence of ``keyword`` found by a depth-first search of
                 nested dictionaries and lists.
        :raises ValueError: If the file is not valid JSON, or if the keyword
                            is specified but not found.

        Unlike the read_json method, this method assumes the entire file is
        formatted as a .json file. A key whose value is ``null`` is returned
        as ``None`` rather than reported as missing.

        Example
        -------
        If the input file titled example.json has the following format

        .. code-block:: json

           {
               "key1": "value1",
               "key2": {
                   "subkey1": "subvalue1",
                   "subkey2": {
                       "subsubkey1": "subsubvalue1",
                       "subsubkey2": "subsubvalue2"
                   }
               }
           }

        the code to extract data would look like:

        .. code-block:: python

           from cobralib.io import ReadJSON
           reader = ReadJSON("example.json")
           value = reader.read_full_json()            # the whole dictionary
           new_value = reader.read_full_json("subkey2")
           print(new_value)
           >> {'subsubkey1': 'subsubvalue1', 'subsubkey2': 'subsubvalue2'}
        """
        json_data = json.loads("\n".join(self.__jsonlines))
        if keyword is None:
            return json_data
        result = self._find_key(json_data, keyword)
        if result is self._NOT_FOUND:
            raise ValueError(f"Keyword '{keyword}' not found in the JSON data")
        return result

    # ==========================================================================================
    # PRIVATE-LIKE METHODS

    @classmethod
    def _find_key(cls, data, keyword):
        """
        Depth-first search for ``keyword``; returns ``_NOT_FOUND`` if absent
        """
        if isinstance(data, dict):
            if keyword in data:
                return data[keyword]
            children = data.values()
        elif isinstance(data, list):
            children = data
        else:
            return cls._NOT_FOUND
        for child in children:
            found = cls._find_key(child, keyword)
            if found is not cls._NOT_FOUND:
                return found
        return cls._NOT_FOUND

    # ------------------------------------------------------------------------------------------

    def _read_jsonlines(self):
        """
        This private method will read in all lines from the text file, with
        trailing white space removed from each line
        """
        with open(self._file_name) as file:
            lines = [line.rstrip() for line in file]
        return lines
# ==========================================================================================
# ==========================================================================================
class ReadXML:
    """
    Read XML data from a pure XML file or from a mixed, XML-like file.

    :param file_name: The name and path of the file with the xml-like
                      format. While not required, it is recommended that this
                      file either be an .xml or .jwc file.
    :raises FileNotFoundError: If the file does not exist.

    The file containing the XML data can contain traditional XML data or mix
    XML with yaml-like key value pairs. If the file is mixed, it is
    recommended that the file be defined with a .jwc extension. ``read_xml``
    reads the XML element that follows a keyword in a mixed file, and
    ``read_full_xml`` parses the whole file as XML. Both return the data as
    produced by ``xmltodict.parse``.
    """

    # First opening tag in a string. XML declarations ("<?xml ...?>"),
    # comments ("<!-- -->") and closing tags are skipped so that they are
    # never mistaken for the root element.
    _OPEN_TAG = re.compile(r"<([^/>?!\s][^/>\s]*)")

    def __init__(self, file_name: str):
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
        self._file_name = file_name
        self.__xmllines = self._read_xml_lines()

    # ------------------------------------------------------------------------------------------

    def read_xml(self, keyword: str) -> dict:
        """
        Read the XML element written to the right of a keyword.

        :param keyword: The keyword to search for at the start of each line.
        :return: The XML data as a dictionary, keyed by the root tag.
        :raises ValueError: If the keyword is not found or if the XML data
                            is not valid.

        Text is collected from the keyword up to and including the line that
        holds the closing tag of the first element found, which may be the
        keyword line itself.

        Example
        -------
        .. code-block:: text

           Yaml Key: Test String
           Yaml Dict:
             One: 1.1
             Two: 2.2
           XML Book Data: <root>
               <book>History of the World</book>
               <Year>1976</Year>
           </root>

        .. code-block:: python

           from cobralib.io import ReadXML
           reader = ReadXML("example.jwc")
           value = reader.read_xml("XML Book Data:")
           print(value)
           >> {'root': {'book': 'History of the World', 'Year': '1976'}}
        """
        xml_data = self._collect_xml(keyword)
        if not (xml_data.startswith("<") and xml_data.endswith(">")):
            raise ValueError(f"Invalid XML data for keyword '{keyword}'")
        try:
            return xmltodict.parse(xml_data)
        except Exception as err:  # any parser error is reported as ValueError
            raise ValueError(f"Invalid XML data for keyword '{keyword}'") from err

    # ------------------------------------------------------------------------------------------

    def read_full_xml(self, keyword: Union[str, None] = None):
        """
        Read the whole file as XML, or the first element with a given tag.

        :param keyword: The tag to search for in the XML data. If None, the
                        full XML data is returned.
        :return: The full XML data, or the first element named ``keyword``,
                 as a dictionary keyed by that element's tag.
        :raises ValueError: If the keyword is specified but not found in the
                            XML data.

        Descendants of the root are searched first; the root element itself
        is returned when only its tag matches ``keyword``.

        Example
        -------
        If the input file titled example.xml has the following format:

        .. code-block:: xml

           <root>
               <key1>value1</key1>
               <key2>
                   <subkey1>subvalue1</subkey1>
                   <subkey2>
                       <subsubkey1>subsubvalue1</subsubkey1>
                       <subsubkey2>subsubvalue2</subsubkey2>
                   </subkey2>
               </key2>
           </root>

        the code to extract data would look like:

        .. code-block:: python

           from cobralib.io import ReadXML
           reader = ReadXML("example.xml")
           value = reader.read_full_xml()              # {'root': {...}}
           new_value = reader.read_full_xml("subkey2")
           print(new_value)
           >> {'subkey2': {'subsubkey1': 'subsubvalue1',
                           'subsubkey2': 'subsubvalue2'}}
        """
        root = ET.parse(self._file_name).getroot()
        if keyword is None:
            element = root
        else:
            element = root.find(f".//{keyword}")
            # ".//tag" only matches descendants, so check the root as well.
            if element is None and root.tag == keyword:
                element = root
            if element is None:
                raise ValueError(f"Keyword '{keyword}' not found in the XML data")
        xml_string = ET.tostring(element, encoding="utf-8").decode()
        return xmltodict.parse(xml_string)

    # ==========================================================================================
    # PRIVATE-LIKE METHODS

    def _collect_xml(self, keyword: str) -> str:
        """
        Return the raw XML text that follows ``keyword``.

        Lines are stripped and concatenated, starting with the remainder of
        the keyword line and ending with the line that contains the closing
        tag of the first element found.

        :raises ValueError: If no line starts with the keyword.
        """
        pieces = []
        found_keyword = False
        root_tag = None
        for line in self.__xmllines:
            if line.startswith(keyword):
                found_keyword = True
                # Split once only: the keyword text may recur inside the XML.
                fragment = line.split(keyword, 1)[1].strip()
            elif found_keyword:
                fragment = line.strip()
            else:
                continue
            pieces.append(fragment)
            if root_tag is None:
                match = self._OPEN_TAG.search(fragment)
                if match:
                    root_tag = match.group(1)
            # Checked on every collected line, including the keyword line,
            # so single-line XML does not swallow the rest of the file.
            if root_tag is not None and f"</{root_tag}>" in fragment:
                break
        if not found_keyword:
            raise ValueError(f"Keyword '{keyword}' not found in the file")
        return "".join(pieces)

    # ------------------------------------------------------------------------------------------

    def _read_xml_lines(self):
        """
        This private method will read in all lines from the text file, with
        trailing white space removed from each line
        """
        with open(self._file_name) as file:
            lines = [line.rstrip() for line in file]
        return lines
# ==========================================================================================
# ==========================================================================================
[docs]class ReadKeyWords(ReadYAML, ReadJSON, ReadXML):
"""
This class is a container for the ReadYAML, ReadJSON, and ReadXML classes. This
class is developed specifically to read .jwc file types, which can mix JSON,
XML, and YAML formats. This class can also be used to read a pure XML, JSON,
or YAML file.
:param file_name: The file name to be read including the path length
:param print_lines: The number of lines to be printed to the screen if the
user prints an instance of the class. Defaulted to 50
:raises FileNotFoundError: If the file does not exist
Example File
------------
.. code-block:: text
---
# First document in file
Float Value: 4.387
Double Value: 1.11111187
integer: 6
String: Hello
Float List: [1.1, 2.2, 3.3, 4.4]
Yaml Block List:
- 1
- 2
- 3
- 4
Yaml Dict:
First Key: 3.3
Second Key: 4.4
Third Key: 5.5
Fourth Key: 6.6
String List Hello World How are you
JSON Data: {"book": "History of the World", "Year": 1976}
XML Data: <root>
<book>"History of the World"</book>
<Year>1976</Year>
</root>
---
# Second document in file
# Notice that a : character is not required
Another Int 3
Instantiation Example
---------------------
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
reader = ReadKeyWords("test_key_words.jwc", print_lines=2)
# Print the instance, displaying 2 lines
print(reader)
.. code-block:: bash
>> Float Value: 4.387 # Comment line not to be read
>> Double Value: 1.11111187 # Comment line not to be read
The user can also adjust the print_lines attribute after instantiation
if they wish to change the number of printed lines
Read Scalar Values
------------------
This class can be used to read in key value pairs.
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
reader = ReadKeyWords("test_key_words.jwc")
int_value = reader.read_key_value("integer:", int)
double_value = reader.read_key_value("Double Value:", np.float64)
# Read from second document in file
second_doc = reader.read_key_value("Another Int", int, 1)
print("Integer Value: ", int_value)
print(type)
print("Double Value: ", double_value)
print(type)
print(second_doc)
.. code-block:: bash
>> Integer Value: 6
>> int
>> Double Value: 1.11111187
>> np.float64
>> 3
Read List Values
----------------
This class can be used to read in lists stored inline or in block
formats
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
reader = ReadKeyWords("test_key_words.jwc")
inline_list = reader.read_yaml_list("Float List:", float)
block_list = reader.read_yaml_list("Yaml Block List:", int)
print("Inline List: ", inline_list)
print("Block List: ", block_list)
.. code-block:: bash
>> Inline List: [ 1.1, 2.2, 3.3, 4.4 ]
>> Block List: [ 1, 2, 3, 4 ]
Read JSON and XML
-----------------
This class can be used to read JSON and XML data associated with key words
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
reader = ReadKeyWords("test_key_words.jwc")
json_data = reader.read_json("JSON Data:")
xml_data = reader.read_xml("XML Data:")
print("JSON Data: ", json_data)
print("XML Data: ", xml_data)
.. code-block:: bash
>> JSON Data: {"book": "History of the World", "Year": 1976}
>> XML Data: {"book": "History of the World", "Year": 1976}
Read YAML Dictionaries
----------------------
This class can be used to read dictionaries encoded in YAML formats. Unlike
JSON and XML, dictionaries read in from a YAML format must be flat
(i.e. no nested dictionaries) and of a uniform data type.
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
reader = ReadKeyWords("test_key_words.jwc")
yaml_dict = reader.read_yaml_dict("Yaml Dict:", str, float)
print("YAML Dictionary: ", yaml_dict)
.. code-block:: bash
>> YAML Dictionary: {"First Key": 3.3, "Second Key": 4.4,
"Third Key": 5.5, "Fourth Key": 6.6}
**Note:** In order to read in a dictionary of lists, use the ``read_yaml_dict_of_list``
method.
YAML, JSON, and XML Files
-------------------------
If you wish to read a .yaml, .json, or .xml file that does not contain
mixed data, you can use one of these three methods.
.. code-block:: python
# Instantiate the class
from cobralib.io import ReadKeyWords
yaml_reader = ReadKeyWords("test_key_words.yaml")
yaml_data = yaml_reader.read_full_yaml()
json_reader = ReadKeyWords("test_key_words.json")
json_data = json_reader.read_full_json()
xml_reader = ReadKeyWords("test_key_words.xml")
xml_data = xml_reader.read_full_xml()
"""
def __init__(self, file_name: str, print_lines: int = 50):
    """
    Set up the reader and cache the lines of ``file_name``.

    :param file_name: The name of the file to be read, to include the path
    :param print_lines: The maximum number of lines returned when the
                        instance is converted to a string (default: 50)
    :raises FileNotFoundError: If ``file_name`` is not an existing file
    """
    # Fail early, before any of the parent initializers touch the file
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"FATAL ERROR: {file_name} does not exist")
    # Every inherited reader is initialized explicitly with the same file
    for parent in (ReadYAML, ReadJSON, ReadXML):
        parent.__init__(self, file_name)
    # Cache the file name and the stripped lines for later use
    self._file_name = file_name
    self.__lines = self._read_lines()
    self.print_lines = print_lines
# ==========================================================================================
# PRIVATE-LIKE methods
def _read_lines(self):
"""
This private method will read in all lines from the text file
"""
with open(self._file_name) as file:
lines = [line.strip() for line in file]
return lines
# ------------------------------------------------------------------------------------------
def __str__(self):
"""
This private method determines how many of the lines are to be printed to
screen and pre-formats the data for printing.
"""
num_lines = min(self.print_lines, len(self.__lines))
return "\n".join(self.__lines[:num_lines])
# ==========================================================================================
# ==========================================================================================
# READ COLUMNAR DATA
def read_csv_columns_by_headers(
    file_name: str, headers: dict[str, type], skip: int = 0
) -> pd.DataFrame:
    """
    Read selected columns of a comma delimited file, identified by their
    header names, into a pandas dataframe.

    :param file_name: The file name to include path-link
    :param headers: A dictionary that maps the name of each column to be
                    read to the data type of that column (for example
                    ``int``, ``float`` or ``str``)
    :param skip: The number of lines at the top of the file to be skipped
                 before the header row is read
    :return df: A pandas dataframe containing only the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    The file must use a comma (i.e. ,) delimiter.  A file with a different
    delimiter should be read with ``read_text_columns_by_headers``.

    Assume a file titled ``test1.csv`` whose first two lines hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the csv file
        This line is as well
        ID,Inventory,Weight_per,Number
        1,Shoes,1.5,5
        2,t-shirt,1.8,3
        3,coffee,2.1,15
        4,books,3.2,48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_csv_columns_by_headers
        > file_name = 'test1.csv'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_csv_columns_by_headers(file_name, headers, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the header row is the first line of the file, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # Only the requested columns are parsed, each with its own data type
        return pd.read_csv(
            file_name,
            usecols=list(headers),
            dtype=headers,
            skiprows=skip,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ----------------------------------------------------------------------------
def read_csv_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    skip: int = 0,
) -> pd.DataFrame:
    """
    Read selected columns of a comma delimited file, identified by their
    column index, into a pandas dataframe.

    :param file_name: The file name to include path-link
    :param headers: A dictionary that maps the index of each column to be
                    read to the data type of that column (for example
                    ``int``, ``float`` or ``str``)
    :param col_names: A list containing the names to be given to
                      each column
    :param skip: The number of lines at the top of the file to be skipped
                 before reading data
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    The file must use a comma (i.e. ,) delimiter.  A file with a different
    delimiter should be read with ``read_text_columns_by_index``.

    Assume a file titled ``test1.csv`` that has no header row and whose first
    two lines hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the csv file
        This line is as well
        1,Shoes,1.5,5
        2,t-shirt,1.8,3
        3,coffee,2.1,15
        4,books,3.2,48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_csv_columns_by_index
        > file_name = 'test1.csv'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_csv_columns_by_index(file_name, headers,
                                         names, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the data starts on the first line of the file, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # The dictionary keys double as the positions of the columns to keep
        return pd.read_csv(
            file_name,
            usecols=list(headers),
            names=col_names,
            dtype=headers,
            skiprows=skip,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ------------------------------------------------------------------------------------------
def read_text_columns_by_headers(
    file_name: str,
    headers: dict[str, type],
    skip: int = 0,
    delimiter=r"\s+",
) -> pd.DataFrame:
    """
    Read selected columns of a delimited text file, identified by their
    header names, into a pandas dataframe.

    :param file_name: The file name to include path-link
    :param headers: A dictionary that maps the name of each column to be
                    read to the data type of that column (for example
                    ``int``, ``float`` or ``str``)
    :param skip: The number of lines at the top of the file to be skipped
                 before the header row is read
    :param delimiter: The delimiter separating data in the text file.
                      Defaulted to space delimited, where a space is one or
                      more white space characters.  Any delimiter can be
                      used, to include a comma; however, a comma delimited
                      file should carry a .csv file extension.
    :return df: A pandas dataframe containing only the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    Assume a file titled ``test.txt`` whose first two lines hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the text file
        This line is as well
        ID Inventory Weight_per Number
        1  Shoes     1.5        5
        2  t-shirt   1.8        3
        3  coffee    2.1        15
        4  books     3.2        48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_text_columns_by_headers
        > file_name = 'test.txt'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_text_columns_by_headers(file_name, headers, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the header row is the first line of the file, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # Only the requested columns are parsed, each with its own data type
        return pd.read_csv(
            file_name,
            usecols=list(headers),
            dtype=headers,
            skiprows=skip,
            sep=delimiter,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# --------------------------------------------------------------------------------
def read_text_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    skip: int = 0,
    delimiter=r"\s+",
) -> pd.DataFrame:
    """
    Read selected columns of a delimited text file, identified by their
    column index, into a pandas dataframe.

    :param file_name: The file name to include path-link
    :param headers: A dictionary that maps the index of each column to be
                    read to the data type of that column (for example
                    ``int``, ``float`` or ``str``)
    :param col_names: A list containing the names to be given to
                      each column
    :param skip: The number of lines at the top of the file to be skipped
                 before reading data
    :param delimiter: The delimiter separating data in the text file.
                      Defaulted to space delimited, where a space is one or
                      more white space characters.  Any delimiter can be
                      used, to include a comma; however, a comma delimited
                      file should carry a .csv file extension.
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    Assume a file titled ``test.txt`` that has no header row and whose first
    two lines hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the text file
        This line is as well
        1  Shoes     1.5        5
        2  t-shirt   1.8        3
        3  coffee    2.1        15
        4  books     3.2        48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_text_columns_by_index
        > file_name = 'test.txt'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_text_columns_by_index(file_name, headers,
                                          names, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the data starts on the first line of the file, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # The dictionary keys double as the positions of the columns to keep
        return pd.read_csv(
            file_name,
            usecols=list(headers),
            names=col_names,
            dtype=headers,
            skiprows=skip,
            sep=delimiter,
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ------------------------------------------------------------------------------------------
def read_excel_columns_by_headers(
    file_name: str, tab: str, headers: dict[str, type], skip: int = 0
) -> pd.DataFrame:
    """
    Read selected columns of an Excel sheet, identified by their header
    names, into a pandas dataframe.

    :param file_name: The file name to include path-link.  The workbook is
                      opened with the ``openpyxl`` engine.
    :param tab: The tab or sheet name that data will be read from
    :param headers: A dictionary that maps the name of each column to be
                    read to the data type of that column (for example
                    ``int``, ``float`` or ``str``)
    :param skip: The number of rows at the top of the sheet to be skipped
                 before the header row is read
    :return df: A pandas dataframe containing only the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    NOTE(review): earlier documentation asked for an .xls file, but the
    ``openpyxl`` engine used here targets the newer formats such as .xlsx;
    confirm which format is intended.

    Assume a workbook titled ``test.xlsx`` with the following rows in a tab
    titled ``primary``, where the first two rows hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the file
        This line is as well
        ID   Inventory   Weight_per   Number
        1    Shoes       1.5          5
        2    t-shirt     1.8          3
        3    coffee      2.1          15
        4    books       3.2          48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_excel_columns_by_headers
        > file_name = 'test.xlsx'
        > tab = "primary"
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_excel_columns_by_headers(file_name, tab,
                                             headers, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the header row is the first row of the sheet, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # Only the requested columns are parsed, each with its own data type
        return pd.read_excel(
            file_name,
            sheet_name=tab,
            usecols=list(headers),
            dtype=headers,
            skiprows=skip,
            engine="openpyxl",
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ----------------------------------------------------------------------------
def read_excel_columns_by_index(
    file_name: str,
    tab: str,
    col_index: dict[int, str],
    col_names: list[str],
    skip: int = 0,
) -> pd.DataFrame:
    """
    Read selected columns of an Excel sheet, identified by their column
    index, into a pandas dataframe.

    :param file_name: The file name to include path-link.  The workbook is
                      opened with the ``openpyxl`` engine.
    :param tab: The tab or sheet name that data will be read from
    :param col_index: A dictionary that maps the index of each column to be
                      read to the data type of that column; the values are
                      handed to pandas as the column data types
    :param col_names: A list containing the names to be given to
                      each column
    :param skip: The number of rows at the top of the sheet to be skipped
                 before reading data
    :return df: A pandas dataframe containing the requested columns
    :raises FileNotFoundError: If the file is found to not exist

    NOTE(review): earlier documentation asked for an .xls file, but the
    ``openpyxl`` engine used here targets the newer formats such as .xlsx;
    confirm which format is intended.

    Assume a workbook titled ``test.xlsx`` with the following rows in a tab
    titled ``primary``.  The sheet has no header row and its first two rows
    hold metadata.

    .. code-block:: text

        This line is used to provide metadata for the file
        This line is as well
        1    Shoes       1.5          5
        2    t-shirt     1.8          3
        3    coffee      2.1          15
        4    books       3.2          48

    This file can be read via the following command

    .. code-block:: python

        from cobralib.io import read_excel_columns_by_index
        > file_name = 'test.xlsx'
        > tab = "primary"
        > headers = {0: int, 1: str, 2: float, 3: int}
        > names = ['ID', 'Inventory', 'Weight_per', 'Number']
        > df = read_excel_columns_by_index(file_name, tab, headers,
                                           names, skip=2)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     Shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      48

    When the data starts on the first row of the sheet, ``skip`` can be left
    at its default of 0.
    """
    if os.path.isfile(file_name):
        # header=None tells pandas the sheet carries no header row, so the
        # names in col_names are used as the column labels
        return pd.read_excel(
            file_name,
            sheet_name=tab,
            usecols=list(col_index),
            names=col_names,
            dtype=col_index,
            skiprows=skip,
            header=None,
            engine="openpyxl",
        )
    raise FileNotFoundError(f"File '{file_name}' not found")
# ------------------------------------------------------------------------------------------
def read_pdf_columns_by_headers(
    file_name: str,
    headers: dict[str, type],
    table_idx: int = 0,
    page_num: int = 0,
    skip: int = 0,
) -> pd.DataFrame:
    """
    Read a table from a PDF document and save user-specified columns into a pandas
    DataFrame.  Only the single page selected by ``page_num`` is read, and the
    first row of the extracted table is used as the header row.
    **NOTE:** The pdf document must be a vectorized pdf document and not
    a scan of another document for this function to work.

    :param file_name: The file name to include the path-link to the PDF file.
    :param headers: A dictionary of column names and their data types.
                    Data types are limited to ``int``, ``float``, and ``str``.
    :param table_idx: Index of the table to extract from the page (default: 0).
    :param page_num: Zero-based index of the page from which to extract the
                     table (default: 0).
    :param skip: The number of data rows, counted from the row below the
                 header row, to be dropped before the data is returned
    :return df: A pandas DataFrame containing the specified columns from the table.
    :raises FileNotFoundError: If the PDF file is found to not exist.
    :raises ValueError: If ``table_idx`` is beyond the tables found on the page,
                        or if a requested column is not present in the table.

    Example usage:

    .. code-block:: python

        from cobralib.io import read_pdf_columns_by_headers
        > file_name = 'test.pdf'
        > headers = {'ID': int, 'Inventory': str, 'Weight_per': float, 'Number': int}
        > df = read_pdf_columns_by_headers(file_name, headers, table_idx=0, page_num=1)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")
    # Extract every table found on the requested page using pdfplumber
    with pdfplumber.open(file_name) as pdf:
        tables = pdf.pages[page_num].extract_tables()
    if table_idx >= len(tables):
        raise ValueError(f"Table index {table_idx} out of range.")
    # The first row of the table supplies the column labels
    rows = tables[table_idx]
    df = pd.DataFrame(rows[1:], columns=rows[0])
    # Report missing columns by name instead of letting pandas fail later
    # with an unrelated looking length mismatch
    wanted = list(headers)
    missing = [name for name in wanted if name not in df.columns]
    if missing:
        raise ValueError(
            f"Columns {missing} not found in table {table_idx}; "
            f"available columns are {list(df.columns)}"
        )
    # Selecting by a list of labels yields a new frame, and astype returns
    # another one, so no values are ever assigned on a slice of the table
    return df.iloc[skip:][wanted].astype(headers)
# ------------------------------------------------------------------------------------------
def read_pdf_columns_by_index(
    file_name: str,
    headers: dict[int, type],
    col_names: list[str],
    table_idx: int = 0,
    skip_rows: int = 0,
    page_num: int = 0,
) -> pd.DataFrame:
    """
    Read a table from a PDF document and save user-specified columns into a pandas
    DataFrame based on their column index.  Only the single page selected by
    ``page_num`` is read.  The first row of the extracted table is consumed as
    a header row and its labels are replaced by ``col_names``.
    **NOTE:** The pdf document must be a vectorized pdf
    document and not a scan of another document for this function to work.

    :param file_name: The file name to include the path-link to the PDF file.
    :param headers: A dictionary of column index and their data types.
                    Data types are limited to ``int``, ``float``, and ``str``.
    :param col_names: A list containing the names to be given to each column,
                      in the same order as the entries of ``headers``.  Names
                      beyond the number of requested columns are ignored.
    :param table_idx: Index of the table to extract from the page (default: 0).
    :param skip_rows: Number of data rows, counted from the row below the
                      header row, to be dropped (default: 0).
    :param page_num: Zero-based index of the page from which to extract the
                     table (default: 0).
    :return df: A pandas DataFrame containing the specified columns from the table.
    :raises FileNotFoundError: If the PDF file is found to not exist.
    :raises ValueError: If ``table_idx`` is beyond the tables found on the page,
                        if a column index is beyond the last column of the
                        table, or if fewer names than column indices are given.

    Example usage:

    .. code-block:: python

        from cobralib.io import read_pdf_columns_by_index
        > file_name = 'test.pdf'
        > headers = {0: int, 1: str, 2: float, 3: int}
        > col_names = ['ID', 'Inventory', 'Weight_per', 'Number']  # Column names
        > df = read_pdf_columns_by_index(file_name, headers, col_names,
                                         table_idx=0, skip_rows=2, page_num=1)
        > print(df)
           ID Inventory  Weight_per  Number
        0   1     shoes         1.5       5
        1   2   t-shirt         1.8       3
        2   3    coffee         2.1      15
        3   4     books         3.2      40
    """
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"File '{file_name}' not found")
    # Extract every table found on the requested page using pdfplumber
    with pdfplumber.open(file_name) as pdf:
        tables = pdf.pages[page_num].extract_tables()
    if table_idx >= len(tables):
        raise ValueError(f"Table index {table_idx} out of range.")
    # The first row of the table supplies the (temporary) column labels
    rows = tables[table_idx]
    df = pd.DataFrame(rows[1:], columns=rows[0])
    # Silently dropping an out-of-range index would shift the names and
    # data types onto the wrong columns, so it is reported instead
    indices = list(headers)
    n_columns = df.shape[1]
    out_of_range = [idx for idx in indices if idx >= n_columns]
    if out_of_range:
        raise ValueError(
            f"Column indices {out_of_range} out of range for a table "
            f"with {n_columns} columns."
        )
    if len(col_names) < len(indices):
        raise ValueError(
            f"{len(indices)} column indices were requested but only "
            f"{len(col_names)} column names were given."
        )
    names = list(col_names[: len(indices)])
    # Work on a copy so that renaming and casting never touch a slice
    df = df.iloc[skip_rows:, indices].copy()
    df.columns = names
    # Pair every name with the data type of the index it was assigned to
    return df.astype(dict(zip(names, headers.values())))
# ==========================================================================================
# ==========================================================================================
# READ AND WRITE TO YAML
def write_yaml_file(file_path: str, data: dict, append: bool = False) -> None:
    """
    Write or append data to a YAML file.

    :param file_path: The path of the YAML file
    :param data: The data to be written or appended as a dictionary
    :param append: True to append data to the file as a new YAML document,
                   False to overwrite the file or create a new one
                   (default: False)
    :raises FileNotFoundError: If the file does not exist in append mode

    Any other ``OSError`` raised while writing is not propagated; a message
    describing the error is printed instead.

    .. code-block:: python

        from cobralib.io import write_yaml_file
        dict_file = {'sports': ['soccer', 'football', 'basketball',
                                'cricket', 'hockey', 'table tennis'],
                     'countries': ['Pakistan', 'USA', 'India',
                                   'China', 'Germany', 'France', 'Spain']}
        # Create new yaml file
        write_yaml_file('new_file.yaml', dict_file, append=False)

    This will create a file titled new_file.yaml with the following contents

    .. literalinclude:: ../../../data/test/output.yaml
       :language: text
    """
    if append:
        # Appending only makes sense when there is a file to append to
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File '{file_path}' not found.")
        mode = "a"
    else:
        mode = "w"
    try:
        with open(file_path, mode) as stream:
            if append:
                stream.write("---\n")  # Add YAML document separator
            yaml.safe_dump(data, stream)
    except OSError as err:
        print(f"Error writing to file: {err}")
# ==========================================================================================
# ==========================================================================================
class Logger:
    """
    Custom logging class that writes messages to both console and log file.

    :param filename: The name of the file to write logs to.  It is also used
                     as the name of the underlying ``logging`` logger.
    :param console_level: The minimum logging level for the console. Should be one
                          of: 'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR',
                          'CRITICAL'.
    :param file_level: The minimum logging level for the log file. Should be one of:
                       'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
    :param max_lines: The maximum number of lines in the log file. When exceeded,
                      the oldest entries are deleted.
    :raises ValueError: If `console_level` or `file_level` are not valid logging
                        levels.
    :raises IOError: If an I/O error occurs when opening the file.

    Creating a second instance for the same ``filename`` replaces the handlers
    installed by the first one, so every message is written only once.

    **Example usage:**

    .. code-block:: python

        # create logger with filename='my_log.log', console_level='INFO',
        # file_level='DEBUG', and max_lines=100
        logger = Logger('my_log.log', 'INFO', 'DEBUG', 100)
        # log a DEBUG message
        logger.log('DEBUG', 'This is a debug message')
        # log an INFO message
        logger.log('INFO', 'This is an info message')
    """

    # Accepted level names and the logging module constants they stand for
    _LEVELS = {
        "NOTSET": logging.NOTSET,
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }

    def __init__(self, filename, console_level, file_level, max_lines):
        # Validate both levels first so that a bad level name does not
        # leave an opened file handler behind
        console_lvl = self._str_to_log_level(console_level)
        file_lvl = self._str_to_log_level(file_level)
        self.filename = filename
        self.max_lines = max_lines
        # Creating logger; the handlers decide what is actually emitted
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(logging.DEBUG)
        # logging.getLogger returns the same object for the same name, so
        # handlers left by an earlier instance are removed; otherwise every
        # record would be emitted once per instance
        for stale in list(self.logger.handlers):
            self.logger.removeHandler(stale)
            stale.close()
        # Creating console handler and setting its level
        ch = logging.StreamHandler()
        ch.setLevel(console_lvl)
        # Creating file handler and setting its level
        fh = logging.handlers.RotatingFileHandler(filename, backupCount=1)
        fh.setLevel(file_lvl)
        # Creating formatter
        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        formatter = logging.Formatter(fmt)
        # Setting formatter for ch and fh and adding them to logger
        for handler in (ch, fh):
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    # ------------------------------------------------------------------------------------------

    def _str_to_log_level(self, level):
        """
        Convert string representation of logging level to corresponding
        logging module constants.

        :param level: The string representation of the logging level.
        :return: Corresponding logging level.
        :raises ValueError: If `level` is not a valid logging level.
        """
        try:
            return self._LEVELS[level]
        except (KeyError, TypeError):
            valid = ", ".join(self._LEVELS)
            raise ValueError(
                f"Invalid logging level {level!r}; expected one of: {valid}"
            ) from None

    # ------------------------------------------------------------------------------------------

    def log(self, level, msg):
        """
        Write a log entry.

        :param level: The level of the log entry. Should be one of: 'NOTSET',
                      'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
        :param msg: The message to be logged.
        :raises ValueError: If `level` is not a valid logging level.
        """
        self.logger.log(self._str_to_log_level(level), msg)
        self._trim_log_file()

    def _trim_log_file(self):
        """
        Trims the log file to the last `max_lines` entries.  The file is
        only rewritten when it holds more lines than allowed.  An I/O error
        raised while trimming is logged and not propagated.
        """
        try:
            with open(self.filename, "r+") as f:
                total = 0
                tail = deque(maxlen=self.max_lines)
                for line in f:
                    total += 1
                    tail.append(line)
                # Every line is still in the deque: nothing to remove
                if total == len(tail):
                    return
                f.seek(0)
                f.writelines(tail)
                f.truncate()
        except OSError:
            self.logger.exception("Error while trimming log file")
# ==========================================================================================
# ==========================================================================================
# eof