In [2]:
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from unidecode import unidecode

## Scraping and preprocessing names

In [3]:
def extract_names(url: str):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    names = soup.find_all('p')[2:]
    people = []
    for name in names:
        people += name.text.split(", ")    
    return people


In [4]:
men = extract_names("https://www.krstne-mena.eu/zoznam-slovenskych-muzskych-mien/")
women = extract_names("https://www.krstne-mena.eu/zoznam-slovenskych-zenskych-mien/")

In [5]:
men_decoded = [unidecode(name).lower() for name in men]
women_decoded = [unidecode(name).lower() for name in women]

scraped_names = men_decoded.copy() + women_decoded.copy()
additional_names = [
    'vlado', 'stano', 'anno', 'dodo', 'jozo', 'veronika', 'oksana', 'miro', 'mirko', 'mato', 'matko', 'tino', 'margita', 'stevo', 'ella', 'veronka'
    'laci', 'miso', 'teo', 'paul', 'pavel', 'dodi', 'lubo', 'peto', 'dana', 'fero', 'viera', 'rasto', 'mirka', 'janik', 'antionia', 'olivia', 'janko'
    'rado', 'petka', 'jozka', 'feri', 'karla', 'juro', 'pato', 'milo', 'laci', 'miriam', 'paulie', 'petr', 'misko', 'tomino', 'janko', 'mariana', 'kubo'
    'slavo', 'mata', 'jaro', 'tony', 'marko', 'robo', 'matka', 'janka', 'slavko', 'lacko', 'vlasto', 'pali', 'evulka', 'zlata', 'lucka', 'rudo', 'majo', 'petr'
    ,'katka', 'evka', 'brano', 'zdeno', 'zuzka', 'simonka', 'karin', 'tina', 'nikolas', 'gabor', 'lubosko', 'janci', 'danko', 'tana', 'aneta', 'lubka', 'csaba'
    ,'nada', 'simon', 'palo', 'alex', 'tomi', 'hela', 'juro'
]

scraped_names += additional_names

In [6]:
'alex' in scraped_names

True

## Importing names from the dataset

In [7]:
with open("json_data/users.json", "r", encoding="utf-8") as file:
    loaded_data = json.load(file)

In [8]:
decoded_load = [unidecode(name).lower() for name in loaded_data.keys()]
new_loaded_data = {key: loaded_data[original_key] for original_key, key in zip(loaded_data.keys(), decoded_load)}

In [8]:
i = 0
valid_names = {}

for name in tqdm(new_loaded_data.keys()):
    if name.split(' ')[0] in scraped_names:
        valid_names[name] = f"user{i}"
        i+= 1
    else:
        try:
            if name.split(' ')[1] in scraped_names:
                valid_names[name] = f"user{i}"
                i+= 1
        except IndexError:
            pass
        my_input = input(f"Is valid: '{name}'? ")
        if my_input == "":
            valid_names[name] = f"user{i}"
            i+= 1

100%|██████████| 46582/46582 [59:47<00:00, 12.98it/s]   


In [120]:
count = 0
for name in tqdm(new_loaded_data.keys()):
    if name.split(' ')[0] in scraped_names:
        count += 1
    else: 
        try:
            if name.split(' ')[1] in scraped_names:
                count+= 1
            else:
                raise IndexError
        except IndexError:
            pass

100%|██████████| 46582/46582 [00:00<00:00, 423533.72it/s]


In [121]:
count

35853

In [62]:
len(loaded_data)

47254

In [65]:
for name in new_loaded_data.keys():
    if name.split(" ")[0] not in scraped_names:
        print(name)

legnaw engerauer
usi mi sako
kmandalas mkaradakis
nasta velebirova
skromne
nika gaspar homolova
metwo ditwo
turan simi
ka tus
laci zakhar
michael miklosko
miju wlcak
agape leto
lude vit
dezole mudrosti
anna uhrova
tony tony
nati farkas
rodina darvasi
fuchsleitner martin
hopkins andrej
renka ruttmarova
milo volosin
juro pavol
pato zicho
sob rudolf szekely
maja petrova
palo vitovic
palko tomasik
kac ka
ronko lacko
thomson jd
don john
slavo slavomir novotny
grune teufel
la ci
brano bielik
izidorik dvorik
paulie gabris
dylan obskuriak
daisha tea modranska
janko nerob
anna ha
miriam odlerova
ga bi ka
petr tresne
phill morris
plantaznik
misko benko
solsagan
ruz jur
jurai gonsor
antonin tonny dvorak
duro bulo
michalides klapi
johny ef
henika pozsonyiova
andre tawares
tomino jasek
mike smith
nick nikus pompova
zomri
gomi addams
rob ko
janka hasbach
mamca zgauca
rudy gulas
jakob forman
mitat greben
janko berezny
zuzu schickhofer
veronka sasakova simkova
billas reggeli
sam sebou
jozko mrkvicka
j

In [16]:
# valid_names
with open("json_data/users_delete2.json", "w", encoding="utf-8") as json_file:
    json.dump(valid_names, json_file, indent=4, separators=(',', ': '), ensure_ascii=False)
# with open("json_data/users_delete1.json", "r") as file:
#     valid_names = json.load(file)


In [11]:
keys_to_delete = []
for key in tqdm(valid_names.keys()):
    if len(key) < 7:
        x = input(key)
        if x == '':
            keys_to_delete.append(key)

100%|██████████| 46381/46381 [01:36<00:00, 479.24it/s] 


In [14]:
for key in keys_to_delete:
    del valid_names[key]

In [17]:
valid_names

{'klaudia rosicova': 'user0',
 'legnaw engerauer': 'user1',
 'marek kopilec': 'user2',
 'martin dvorecky': 'user3',
 'andrej kovac': 'user4',
 'usi mi sako': 'user5',
 'margita pergerova': 'user6',
 'peter baranek': 'user7',
 'karol koronczi': 'user8',
 'stefan vidlar': 'user9',
 'tibor nemcok': 'user10',
 'marcel bulik': 'user11',
 'katarina weissova': 'user12',
 'tomas olsansky': 'user13',
 'jozef keseli': 'user14',
 'nasta velebirova': 'user15',
 'ella med orechova': 'user16',
 'michal szentivanyi': 'user17',
 'skromne': 'user18',
 'nika gaspar homolova': 'user19',
 'matus mocny': 'user20',
 'simona vrabikova': 'user21',
 'lubomir tatran': 'user22',
 'peter bozan': 'user23',
 'eva halaskova kucerova': 'user24',
 'metwo ditwo': 'user25',
 'peter cernecky': 'user26',
 'turan simi': 'user27',
 'marian prievoznik': 'user28',
 'robert beno': 'user29',
 'lubo rando': 'user30',
 'peter galajda': 'user32',
 'michal pazak': 'user33',
 'tomas kramar': 'user34',
 'katarina kami ivanicova': 'us