Module adash.string_util

Expand source code
import hashlib
import re
import unicodedata
from typing import Any


def replace_all(s: str, obj: dict) -> str:
    """Apply several regex substitutions to a string, one after another.

    Every key of ``obj`` is used as a regular-expression pattern and its
    value as the replacement text. The pairs are applied in the dict's
    iteration order, each one working on the output of the previous one.

    Args:
        s: The text to rewrite.
        obj: Mapping of regex pattern -> replacement string.

    Returns:
        The text after all substitutions have been applied.

    Example:
        >>> _obj = {"円": ".", "銭": ""}
        >>> replace_all("3円00銭", _obj)
        '3.00'
        >>> _obj = {"[△▲]": "-", "[,、]": ""}
        >>> replace_all('▲12,345', _obj)
        '-12345'
        >>> replace_all('△12、345', _obj)
        '-12345'
    """
    result = s
    for pattern, replacement in obj.items():
        result = re.sub(pattern, replacement, result)
    return result


def to_half_string(s: str) -> str:
    """Return ``s`` converted to Unicode normalization form NFKC.

    NFKC replaces compatibility characters, such as full-width letters and
    digits, with their ordinary equivalents.

    Example:
        >>> to_half_string('１２３ＸＹＺ')
        '123XYZ'
    """
    form = "NFKC"
    return unicodedata.normalize(form, s)


def to_number(s: str, default: Any = float("NAN")) -> Any:
    """Convert a numeric-looking string to an ``int`` or ``float``.

    The text is first NFKC-normalized (so full-width digits and signs
    become ASCII) and then cleaned up: ``△``, ``▲`` and ``Δ`` become a
    minus sign, ``,``, ``、``, ``銭`` and ``%`` are removed, and ``円``
    becomes a decimal point.

    Args:
        s: The text to convert.
        default: Value returned when the cleaned text is not a number.

    Returns:
        An ``int`` when the value is integral, otherwise a ``float``;
        ``default`` when the text cannot be parsed.

    Example:
        >>> to_number('△12,345')
        -12345
        >>> to_number('12,345')
        12345
        >>> to_number('12円34銭')
        12.34
        >>> to_number('98%')
        98
        >>> to_number('abc')
        nan
        >>> to_number('abc', '123')
        '123'
    """
    # Every clean-up rule maps a single character, so one translate() pass
    # is enough; None deletes the character.
    cleanup = str.maketrans(
        {
            "△": "-",
            "▲": "-",
            "Δ": "-",
            ",": None,
            "、": None,
            "銭": None,
            "%": None,
            "円": ".",
        }
    )
    text = unicodedata.normalize("NFKC", s).translate(cleanup)

    # Parse integers directly: going through float() first would silently
    # drop digits for values larger than 2**53.
    try:
        return int(text)
    except ValueError:
        pass

    try:
        value = float(text)
    except ValueError:
        return default
    # Integral floats such as "12.0" or "1e3" are still reported as int.
    return int(value) if value.is_integer() else value


def to_date(s: str) -> Any:
    """Normalize a date string to ``YYYY-MM-DD`` form.

    Accepts ``年/月/日`` notation, slash- or hyphen-separated dates, a bare
    8-digit ``YYYYMMDD`` string, and Japanese era years (明治, 大正, 昭和,
    平成, 令和, or their one-character abbreviations; ``元`` counts as
    year 1). The input is NFKC-normalized first, so full-width digits work.

    Whitespace is removed before parsing and anything after the day digits
    is ignored. A non-digit prefix that is not a known era is dropped and
    the year is kept as written.

    Args:
        s: The date text.

    Returns:
        The date as ``'YYYY-MM-DD'`` (month and day zero-padded to two
        digits), or ``None`` when year, month and day cannot all be found.

    Example:
        >>> to_date('2018年5月27日')
        '2018-05-27'
        >>> to_date('令和 元年 5月12日')
        '2019-05-12'
        >>> to_date('大3年5月27日')
        '1914-05-27'
        >>> to_date('20211010')
        '2021-10-10'
        >>> to_date('令和年5月1日') is None
        True
    """
    # First Gregorian year of each era, keyed by the ASCII tag that the
    # era's kanji name is rewritten to below.
    era_first_year = {
        "meiji": 1868,
        "taisyou": 1912,
        "syouwa": 1926,
        "heisei": 1989,
        "reiwa": 2019,
    }
    # (pattern, replacement) pairs, applied in this order.
    rewrites = (
        ("[年月/]", "-"),
        (r"[日\s]", ""),
        ("[元]", "1"),
        (r"(明治|明)", "meiji"),
        (r"(大正|大)", "taisyou"),
        (r"(昭和|昭)", "syouwa"),
        (r"(平成|平)", "heisei"),
        (r"(令和|令)", "reiwa"),
    )

    text = unicodedata.normalize("NFKC", s)
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # A bare run of exactly eight digits is YYYYMMDD: insert the hyphens.
    compact = re.fullmatch(r"(\d{4})(\d{2})(\d{2})", text)
    if compact:
        text = "-".join(compact.groups())

    # Each component needs at least one digit; otherwise the input is
    # incomplete and is reported as unparseable instead of producing a
    # malformed date or failing on int('').
    parsed = re.match(r"(\D*)(\d+)-(\d+)-(\d+)", text)
    if parsed is None:
        return None

    era, year, month, day = parsed.groups()
    if era in era_first_year:
        # Era year 1 is the era's first Gregorian year.
        year = str(int(year) + era_first_year[era] - 1)
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"


def split_uppercase(s: str) -> list:
    """Split a CamelCase string into its capitalized words.

    A word is one or more ASCII uppercase letters followed by any number of
    ASCII lowercase letters or digits. Characters that do not belong to such
    a word (for example a leading lowercase run) are left out of the result.

    Example:
        >>> split_uppercase('NextAccumulatedQ2Duration')
        ['Next', 'Accumulated', 'Q2', 'Duration']
    """
    word = re.compile(r"[A-Z]+[a-z0-9]*")
    return [hit.group() for hit in word.finditer(s)]


def to_md5(s: str):
    """Return the MD5 digest of a string as hexadecimal text.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before
    it is hashed.

    Example:
        >>> to_md5('test')
        '098f6bcd4621d373cade4e832627b4f6'
    """
    hasher = hashlib.md5()
    hasher.update(s.encode())
    return hasher.hexdigest()


def to_sha256(s: str):
    """Return the SHA-256 digest of a string as hexadecimal text.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before
    it is hashed.

    Example:
        >>> to_sha256('test')
        '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
    """
    hasher = hashlib.sha256()
    hasher.update(s.encode())
    return hasher.hexdigest()


def text_normalize(text: str, exclude_chars: str = "①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳") -> str:
    """NFKC-normalize a string while leaving selected characters untouched.

    Characters listed in ``exclude_chars`` are copied through unchanged.
    Every run of other characters is normalized as a whole, so sequences
    that only compose together (for example half-width katakana followed
    by a voiced-sound mark) are combined correctly.

    Args:
        text: The text to normalize.
        exclude_chars: Characters that must not be normalized. Defaults to
            the circled numbers ① to ⑳.

    Returns:
        The normalized text.

    Example:
        >>> text_normalize("ﾊﾝｶｸｶﾀｶﾅｚｅｎｋａｋｕ１２３")
        'ハンカクカタカナzenkaku123'
        >>> text_normalize("①②③123")
        '①②③123'
        >>> text_normalize("①②③123", exclude_chars="①1")
        '①23123'
        >>> text_normalize("ｶﾞ")
        'ガ'
    """
    from itertools import groupby

    pieces = []
    # Split the text into alternating runs of excluded / non-excluded
    # characters. Normalizing per run rather than per character lets NFKC
    # compose a base character with the combining mark that follows it.
    for is_excluded, run in groupby(text, key=lambda ch: ch in exclude_chars):
        chunk = "".join(run)
        pieces.append(chunk if is_excluded else unicodedata.normalize("NFKC", chunk))
    return "".join(pieces)

Functions

def replace_all(s: str, obj: dict) ‑> str

複数のreplace

Example

>>> _obj = {"円": ".", "銭": ""}
>>> replace_all("3円00銭", _obj)
'3.00'
>>> _obj = {"[△▲]": "-", "[,、]": ""}
>>> replace_all('▲12,345', _obj)
'-12345'
>>> replace_all('△12、345', _obj)
'-12345'
Expand source code
def replace_all(s: str, obj: dict) -> str:
    """Apply several regex substitutions to a string, one after another.

    Every key of ``obj`` is used as a regular-expression pattern and its
    value as the replacement text. The pairs are applied in the dict's
    iteration order, each one working on the output of the previous one.

    Args:
        s: The text to rewrite.
        obj: Mapping of regex pattern -> replacement string.

    Returns:
        The text after all substitutions have been applied.

    Example:
        >>> _obj = {"円": ".", "銭": ""}
        >>> replace_all("3円00銭", _obj)
        '3.00'
        >>> _obj = {"[△▲]": "-", "[,、]": ""}
        >>> replace_all('▲12,345', _obj)
        '-12345'
        >>> replace_all('△12、345', _obj)
        '-12345'
    """
    result = s
    for pattern, replacement in obj.items():
        result = re.sub(pattern, replacement, result)
    return result
def split_uppercase(s: str) ‑> list

UpperCaseを分割

Example

>>> split_uppercase('NextAccumulatedQ2Duration')
['Next', 'Accumulated', 'Q2', 'Duration']
Expand source code
def split_uppercase(s: str) -> list:
    """Split a CamelCase string into its capitalized words.

    A word is one or more ASCII uppercase letters followed by any number of
    ASCII lowercase letters or digits. Characters that do not belong to such
    a word (for example a leading lowercase run) are left out of the result.

    Example:
        >>> split_uppercase('NextAccumulatedQ2Duration')
        ['Next', 'Accumulated', 'Q2', 'Duration']
    """
    word = re.compile(r"[A-Z]+[a-z0-9]*")
    return [hit.group() for hit in word.finditer(s)]
def text_normalize(text: str, exclude_chars: str = '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳') ‑> str

文字列の正規化

Example

>>> text_normalize("ハンカクカタカナzenkaku123")
'ハンカクカタカナzenkaku123'
>>> text_normalize("①②③123")
'①②③123'
>>> text_normalize("①②③123", exclude_chars="①1")
'①23123'
Expand source code
def text_normalize(text: str, exclude_chars: str = "①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳") -> str:
    """NFKC-normalize a string while leaving selected characters untouched.

    Characters listed in ``exclude_chars`` are copied through unchanged.
    Every run of other characters is normalized as a whole, so sequences
    that only compose together (for example half-width katakana followed
    by a voiced-sound mark) are combined correctly.

    Args:
        text: The text to normalize.
        exclude_chars: Characters that must not be normalized. Defaults to
            the circled numbers ① to ⑳.

    Returns:
        The normalized text.

    Example:
        >>> text_normalize("ﾊﾝｶｸｶﾀｶﾅｚｅｎｋａｋｕ１２３")
        'ハンカクカタカナzenkaku123'
        >>> text_normalize("①②③123")
        '①②③123'
        >>> text_normalize("①②③123", exclude_chars="①1")
        '①23123'
        >>> text_normalize("ｶﾞ")
        'ガ'
    """
    from itertools import groupby

    pieces = []
    # Split the text into alternating runs of excluded / non-excluded
    # characters. Normalizing per run rather than per character lets NFKC
    # compose a base character with the combining mark that follows it.
    for is_excluded, run in groupby(text, key=lambda ch: ch in exclude_chars):
        chunk = "".join(run)
        pieces.append(chunk if is_excluded else unicodedata.normalize("NFKC", chunk))
    return "".join(pieces)
def to_date(s: str) ‑> Any

日付表記を統一する

Example

>>> to_date('2018年5月27日')
'2018-05-27'
>>> to_date('令和 元年 5月12日')
'2019-05-12'
>>> to_date('大3年5月27日')
'1914-05-27'
>>> to_date('20211010')
'2021-10-10'
Expand source code
def to_date(s: str) -> Any:
    """Normalize a date string to ``YYYY-MM-DD`` form.

    Accepts ``年/月/日`` notation, slash- or hyphen-separated dates, a bare
    8-digit ``YYYYMMDD`` string, and Japanese era years (明治, 大正, 昭和,
    平成, 令和, or their one-character abbreviations; ``元`` counts as
    year 1). The input is NFKC-normalized first, so full-width digits work.

    Whitespace is removed before parsing and anything after the day digits
    is ignored. A non-digit prefix that is not a known era is dropped and
    the year is kept as written.

    Args:
        s: The date text.

    Returns:
        The date as ``'YYYY-MM-DD'`` (month and day zero-padded to two
        digits), or ``None`` when year, month and day cannot all be found.

    Example:
        >>> to_date('2018年5月27日')
        '2018-05-27'
        >>> to_date('令和 元年 5月12日')
        '2019-05-12'
        >>> to_date('大3年5月27日')
        '1914-05-27'
        >>> to_date('20211010')
        '2021-10-10'
        >>> to_date('令和年5月1日') is None
        True
    """
    # First Gregorian year of each era, keyed by the ASCII tag that the
    # era's kanji name is rewritten to below.
    era_first_year = {
        "meiji": 1868,
        "taisyou": 1912,
        "syouwa": 1926,
        "heisei": 1989,
        "reiwa": 2019,
    }
    # (pattern, replacement) pairs, applied in this order.
    rewrites = (
        ("[年月/]", "-"),
        (r"[日\s]", ""),
        ("[元]", "1"),
        (r"(明治|明)", "meiji"),
        (r"(大正|大)", "taisyou"),
        (r"(昭和|昭)", "syouwa"),
        (r"(平成|平)", "heisei"),
        (r"(令和|令)", "reiwa"),
    )

    text = unicodedata.normalize("NFKC", s)
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # A bare run of exactly eight digits is YYYYMMDD: insert the hyphens.
    compact = re.fullmatch(r"(\d{4})(\d{2})(\d{2})", text)
    if compact:
        text = "-".join(compact.groups())

    # Each component needs at least one digit; otherwise the input is
    # incomplete and is reported as unparseable instead of producing a
    # malformed date or failing on int('').
    parsed = re.match(r"(\D*)(\d+)-(\d+)-(\d+)", text)
    if parsed is None:
        return None

    era, year, month, day = parsed.groups()
    if era in era_first_year:
        # Era year 1 is the era's first Gregorian year.
        year = str(int(year) + era_first_year[era] - 1)
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
def to_half_string(s: str) ‑> str

normalize

Example

>>> to_half_string('123XYZ')
'123XYZ'
Expand source code
def to_half_string(s: str) -> str:
    """Return ``s`` converted to Unicode normalization form NFKC.

    NFKC replaces compatibility characters, such as full-width letters and
    digits, with their ordinary equivalents.

    Example:
        >>> to_half_string('１２３ＸＹＺ')
        '123XYZ'
    """
    form = "NFKC"
    return unicodedata.normalize(form, s)
def to_md5(s: str)

文字列をmd5化

Example

>>> to_md5('test')
'098f6bcd4621d373cade4e832627b4f6'
Expand source code
def to_md5(s: str):
    """Return the MD5 digest of a string as hexadecimal text.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before
    it is hashed.

    Example:
        >>> to_md5('test')
        '098f6bcd4621d373cade4e832627b4f6'
    """
    hasher = hashlib.md5()
    hasher.update(s.encode())
    return hasher.hexdigest()
def to_number(s: str, default: Any = nan) ‑> Any

文字列を数値化

Example

>>> to_number('△12,345')
-12345
>>> to_number('12,345')
12345
>>> to_number('12円34銭')
12.34
>>> to_number('98%')
98
>>> to_number('abc')
nan
>>> to_number('abc', '123')
'123'
Expand source code
def to_number(s: str, default: Any = float("NAN")) -> Any:
    """Convert a numeric-looking string to an ``int`` or ``float``.

    The text is first NFKC-normalized (so full-width digits and signs
    become ASCII) and then cleaned up: ``△``, ``▲`` and ``Δ`` become a
    minus sign, ``,``, ``、``, ``銭`` and ``%`` are removed, and ``円``
    becomes a decimal point.

    Args:
        s: The text to convert.
        default: Value returned when the cleaned text is not a number.

    Returns:
        An ``int`` when the value is integral, otherwise a ``float``;
        ``default`` when the text cannot be parsed.

    Example:
        >>> to_number('△12,345')
        -12345
        >>> to_number('12,345')
        12345
        >>> to_number('12円34銭')
        12.34
        >>> to_number('98%')
        98
        >>> to_number('abc')
        nan
        >>> to_number('abc', '123')
        '123'
    """
    # Every clean-up rule maps a single character, so one translate() pass
    # is enough; None deletes the character.
    cleanup = str.maketrans(
        {
            "△": "-",
            "▲": "-",
            "Δ": "-",
            ",": None,
            "、": None,
            "銭": None,
            "%": None,
            "円": ".",
        }
    )
    text = unicodedata.normalize("NFKC", s).translate(cleanup)

    # Parse integers directly: going through float() first would silently
    # drop digits for values larger than 2**53.
    try:
        return int(text)
    except ValueError:
        pass

    try:
        value = float(text)
    except ValueError:
        return default
    # Integral floats such as "12.0" or "1e3" are still reported as int.
    return int(value) if value.is_integer() else value
def to_sha256(s: str)

文字列をsha256化

Example

>>> to_sha256('test')
'9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
Expand source code
def to_sha256(s: str):
    """Return the SHA-256 digest of a string as hexadecimal text.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before
    it is hashed.

    Example:
        >>> to_sha256('test')
        '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
    """
    hasher = hashlib.sha256()
    hasher.update(s.encode())
    return hasher.hexdigest()