Jalankan fungsi pada setiap elemen di kolom daftar dataframe

Aug 21 2020

Yang ini sedikit rumit bagiku.

Dataframe:

parent           children
0   MAX          [MAX, amx, akd]
1   Sam          ['Sam','sammy','samsam']
2   Larry        ['lar','lair','larrylamo']

Saya memiliki fungsi di mana jika saya mengirimkan hanya sebuah string, itu akan membandingkan dua string, dan mencetak angka yang menggambarkan seberapa dekat karakter (dalam jarak). Mirip dengan persamaan levenshtein.

Bagaimana cara menjalankan fungsi ini pada dataframe? Saya perlu membandingkan setiap record di kolom pertama ('parent') dengan daftar yang bersesuaian di kolom kedua ('children').

Saat ini, saya bisa menjalankan ini dan mendapatkan hasil ini:

>>> reference = 'larry'
>>> value_list = ['lar','lair','larrylamo']
>>> get_top_matches(reference,value_list)
[('lar',0.91),('larrylamo',0.91),('lair',0.83)]

Saya mencoba membuat kolom ketiga yang berisi daftar tupel hasil pencocokan untuk setiap baris, seperti ini:

parent           children                     func_results
0   MAX          [MAX, amx, akd]              [('MAX',1.0),('amx',0.89),('akd',0.56)]
1   Sam          ['Sam','sammy','samsam']     [('Sam',1.0),('sammy',0.91), ('samsam',0.88)]
2   Larry        ['lar','lair','larrylamo']   [('lar',0.91),('larrylamo',0.91), ('lair',0.83)]

Saya pikir fungsinya harus dapat bekerja sebagaimana adanya jika saya bisa mencari cara untuk menerapkannya dalam loop for terhadap df.


inilah fungsinya:

import math
import re

def sort_token_alphabetically(word):
    """Return *word* with its tokens rearranged in sorted order.

    The input is split on every comma, period, or space; the resulting
    pieces (including any empty pieces produced by adjacent separators)
    are sorted and joined back together with single spaces.
    """
    pieces = re.split('[,. ]', word)
    pieces.sort()
    return ' '.join(pieces)

def get_jaro_distance(first, second, winkler=True, winkler_ajustment=True,
                      scaling=0.1, sort_tokens=True):
    """Return the Jaro similarity of two strings, optionally Winkler-adjusted.

    Args:
        first: First string to compare; must be non-empty.
        second: Second string to compare; must be non-empty.
        winkler: Apply the Winkler prefix adjustment (only when
            ``winkler_ajustment`` is also true).
        winkler_ajustment: Second switch for the prefix adjustment; both
            flags must be true for it to be applied.
        scaling: Weight given to the common prefix in the adjustment.
        sort_tokens: When true, both strings have their tokens sorted via
            ``sort_token_alphabetically`` before being compared.

    Returns:
        With the adjustment enabled, ``jaro + scaling * cl * (1 - jaro)``
        rounded to two decimals, where ``cl`` is the common-prefix length
        capped at 4. Otherwise the raw, unrounded score from ``_score``.

    Raises:
        JaroDistanceException: If either argument is empty or ``None``.
    """
    # Validate before token sorting: sort_token_alphabetically() passes its
    # argument to re.split(), which raises TypeError for None and would
    # pre-empt the intended JaroDistanceException below.
    if not first or not second:
        raise JaroDistanceException(
            "Cannot calculate distance from NoneType ({0}, {1})".format(
                first.__class__.__name__,
                second.__class__.__name__))

    if sort_tokens:
        first = sort_token_alphabetically(first)
        second = sort_token_alphabetically(second)

    jaro = _score(first, second)
    # Winkler only rewards up to four leading characters in common.
    cl = min(len(_get_prefix(first, second)), 4)

    if winkler and winkler_ajustment:
        return round((jaro + (scaling * cl * (1.0 - jaro))) * 100.0) / 100.0

    return jaro

def _score(first, second):
    """Compute the plain Jaro score of two strings, ignoring case.

    Both inputs are lower-cased, ordered into a shorter and a longer
    string (by the length of the original arguments), and matched in both
    directions. Returns 0.0 when either direction yields no matching
    characters; otherwise the mean of the two match ratios and the
    non-transposed ratio.
    """
    lowered_first, lowered_second = first.lower(), second.lower()
    if len(first) > len(second):
        shorter, longer = lowered_second, lowered_first
    else:
        shorter, longer = lowered_first, lowered_second

    short_matches = _get_matching_characters(shorter, longer)
    long_matches = _get_matching_characters(longer, shorter)
    if not short_matches or not long_matches:
        return 0.0

    match_count = len(short_matches)
    short_ratio = float(match_count) / len(shorter)
    long_ratio = float(len(long_matches)) / len(longer)
    ordered_ratio = float(
        match_count - _transpositions(short_matches, long_matches)
    ) / match_count
    return (short_ratio + long_ratio + ordered_ratio) / 3.0

def _get_diff_index(first, second):
    if first == second:
        pass

    if not first or not second:
        return 0

    max_len = min(len(first), len(second))
    for i in range(0, max_len):
        if not first[i] == second[i]:
            return i

    return max_len

def _get_prefix(first, second):
    """Return the leading part of *first* that *second* also starts with.

    Yields an empty string when either input is empty or when the strings
    differ at the very first character. A diff index of -1 is treated as
    "no difference" and returns *first* unchanged.
    """
    if not (first and second):
        return ""

    split_at = _get_diff_index(first, second)
    if split_at == -1:
        return first

    # A split point of 0 slices to "", matching the no-common-prefix case.
    return first[:split_at]

def _get_matching_characters(first, second):
    common = []
    limit = math.floor(min(len(first), len(second)) / 2)

    for i, l in enumerate(first):
        left, right = int(max(0, i - limit)), int(
            min(i + limit + 1, len(second)))
        if l in second[left:right]:
            common.append(l)
            second = second[0:second.index(l)] + '*' + second[
                                                       second.index(l) + 1:]

    return ''.join(common)

def _transpositions(first, second):
    return math.floor(
        len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0)

def get_top_matches(reference, value_list, max_results=None):
    """Rank the entries of *value_list* by similarity to *reference*.

    Each candidate is scored twice with ``get_jaro_distance`` - once with
    token sorting and once without - and keeps the higher score.

    Args:
        reference: String every candidate is compared against.
        value_list: Candidate strings.
        max_results: Maximum number of results; a falsy value (``None`` or
            0) means "return all of them".

    Returns:
        A list of ``(candidate, score)`` tuples, highest score first.
    """
    limit = max_results if max_results else len(value_list)

    scored = [
        (
            candidate,
            max(
                get_jaro_distance(reference, candidate),
                get_jaro_distance(reference, candidate, sort_tokens=False),
            ),
        )
        for candidate in value_list
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]

class JaroDistanceException(Exception):
    """Raised when a Jaro distance cannot be computed for the given inputs."""

    def __init__(self, message):
        # super() must name THIS class so the lookup starts right after
        # JaroDistanceException in the MRO; naming Exception skipped that
        # level and jumped straight to BaseException.
        super(JaroDistanceException, self).__init__(message)


# Example: rank the candidate strings against 'larry', best match first.
reference = 'larry'
value_list = [
    'lar',
    'lair',
    'larrylamo',
]
get_top_matches(reference=reference, value_list=value_list)

Jawaban

1 AndyL. Aug 21 2020 at 04:05

Saya berasumsi dataset Anda yang sebenarnya memiliki tepat 2 kolom seperti pada sampel Anda. Gunakan agg pada axis=1

# Score each row's 'children' list against its 'parent' value. The columns are
# selected by name so the call does not depend on the frame's column count or
# order; unpacking the whole row would pass the new 'func_results' column as
# max_results if this line were run a second time.
df['func_results'] = df.agg(
    lambda row: get_top_matches(row['parent'], row['children']), axis=1)


Out[366]:
  parent  ...                                    func_results
0    MAX  ...          [(MAX, 1.0), (amx, 0.89), (akd, 0.56)]
1    Sam  ...     [(Sam, 1.0), (sammy, 0.87), (samsam, 0.83)]
2  Larry  ...  [(lar, 0.87), (larrylamo, 0.85), (lair, 0.78)]

[3 rows x 3 columns]