In Python you always work with texts. And you want to do many things with texts. You want to join them, split them, transform them, search them, save them and load them.
Below are the text-related utilities I have developed over time.
import pathlib
import re
import logging
from typing import List
from collections import Counter
from gpwrap.utils.configdict.configDict import ConfigDict
def split_text_by_certain_substring_and_save(long_text, split_str, filepath: pathlib.Path):
    """
    Split a text on a literal substring, save the pieces and echo them.

    The pieces are written to ``filepath`` separated by one blank line, and
    each piece is also printed to stdout followed by a run of blank lines.

    :param long_text: the text to split
    :param split_str: literal delimiter handed to ``str.split``
    :param filepath: destination file; its previous content is overwritten
    :return: the list of pieces produced by the split
    """
    pieces = long_text.split(split_str)
    filepath.write_text("\n\n".join(pieces))
    for piece in pieces:
        # One call emits the piece plus the blank-line spacer that sets it
        # apart from the next piece on the console.
        print(piece, "\n\n", sep="\n")
    return pieces
def join_lines_with_keyword(keyword, lines):
    """
    Glue ``lines`` together using ``keyword`` as the separator.

    :param keyword: separator; it is formatted to text first, so non-string
        values are accepted
    :param lines: iterable of strings to join
    :return: the joined string
    """
    separator = format(keyword)
    return separator.join(lines)
def format_sql_query(sql_keywords, query):
    """
    Put every SQL keyword on its own line, with the text after it tab-indented.

    Each occurrence of a keyword in ``query`` is replaced by a newline, the
    keyword, another newline and a tab. Keywords are applied one after the
    other in the order given, and are matched as literal, case-sensitive text.

    :param sql_keywords: iterable of keyword strings (e.g. ``"FROM"``)
    :param query: the SQL text to reformat
    :return: the reformatted query
    """
    for keyword in sql_keywords:
        # Escape the keyword: it is plain text, not a pattern. Without this a
        # keyword such as "COUNT(*)" fails to compile as a regex and one
        # containing "." matches unrelated text.
        parts = re.split(re.escape(keyword), query)
        query = f"\n{keyword}\n\t".join(parts)
    return query
def extract_subtext_from_big_text(big_text_lines, line_one, line_two):
    """
    Return the run of lines from ``line_one`` through ``line_two``, inclusive.

    Both markers are located with ``index``, i.e. at their first occurrence
    in ``big_text_lines``. When the first ``line_two`` sits before the first
    ``line_one`` the resulting slice is empty.

    :param big_text_lines: sequence of lines to cut from
    :param line_one: line that opens the excerpt
    :param line_two: line that closes the excerpt
    :return: the slice of ``big_text_lines`` between the two markers
    :raises ValueError: if either marker is not present
    """
    span = slice(big_text_lines.index(line_one), big_text_lines.index(line_two) + 1)
    return big_text_lines[span]
def save_lines_to_file(file: pathlib.Path, lines: List):
    """
    Write ``lines`` to ``file``, one per line, and log how many were saved.

    :param file: destination path; its previous content is overwritten
    :param lines: list of strings; they are joined with newlines and no
        trailing newline is appended
    :return: None
    """
    content = "\n".join(lines)
    file.write_text(content)
    logging.info(f"Saved {len(lines)} lines to {file}")
def combine_lines_to_string(lines, join_char="\n"):
    """
    Merge ``lines`` into a single string.

    :param lines: iterable of strings to merge
    :param join_char: separator placed between consecutive lines; it is
        formatted to text first and defaults to a newline
    :return: the merged string
    """
    glue = format(join_char)
    return glue.join(lines)
def extract_switches_values(text):
    """
    Collect command-line style switches and their values from ``text``.

    The pattern has two capturing groups so that each match yields the switch
    and its value as separate items: the first group is a hyphen followed by
    one or more ASCII letters (the switch), the second is the run of
    non-whitespace characters that follows after some whitespace (the value).

    Note that the value is whatever token comes next, so in ``"-v -o out"``
    the switch ``-v`` is paired with ``-o``.

    :param text: the text to scan, e.g. a command line
    :return: dict mapping each switch (hyphen included) to its value; when a
        switch occurs more than once the last value wins
    """
    switch_value_regex = re.compile(r'(-[a-zA-Z]+)\s+(\S+)')
    return {
        found.group(1): found.group(2)
        for found in switch_value_regex.finditer(text)
    }
def remove_duplicate_lines_in_text(original_file: pathlib.Path, refined_filename: str = None, save_results=True):
    """
    Remove duplicate lines from a text file, keeping the first occurrence of each.

    The de-duplicated text is saved next to the original file under
    ``refined_filename`` unless ``save_results`` is false.

    :param original_file: path of the text file to read
    :param refined_filename: name of the output file inside the same folder;
        defaults to the original name with ``_refined`` inserted before the suffix
    :param save_results: when false, nothing is written to disk and only the
        result object is returned
    :return: ConfigDict with ``new_file`` (the output path, whether or not it
        was written) and ``newtxtcnt`` (a Counter of occurrences per distinct line)
    """
    if not refined_filename:
        refined_filename = original_file.stem + "_refined" + original_file.suffix
    new_file = original_file.parent / refined_filename

    lines = original_file.read_text().split("\n")
    # A Counter is a dict, so its keys keep first-seen order: it gives us the
    # distinct lines in their original order and the per-line counts at once.
    cnt = Counter(lines)

    # Honour the flag: previously the file was written even when the caller
    # passed save_results=False.
    if save_results:
        new_file.write_text("\n".join(cnt))

    logging.info(f"reduced number of lines in {original_file} from {len(lines)} to {len(cnt)}")
    return ConfigDict({"new_file": new_file, "newtxtcnt": cnt})