I have a program that reads lines randomly from a file, and uses threading. The problem is that whenever it reads the lines from a file, it sometimes reads a duplicate line from the file. For instance, let's say I use 5 threads and my file looks like this:
line1
line2
line3
line4
line5
The program uses threading to read the lines randomly, but sometimes it can read line4, line3, line5, line2, line5 (again). So my question is: how do I prevent a line (line5 in this example) from being read more than once?
Code:
def get_token():
    """Return the tokens stored in ``pokens.txt``, one per line.

    The file is read as UTF-8 from the current working directory. The
    trailing newline is removed from every line, and blank lines are skipped
    so that a stray empty line (for example at the end of the file) never
    becomes an empty token.

    Returns:
        list[str]: The tokens in file order.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        # Iterating the file object yields one line at a time, so the whole
        # file never has to be held as a separate list of raw lines.
        return [line.rstrip('\n') for line in file if line.strip()]
def get_proxy():
    """Return the proxies stored in ``proxies.txt``, one per line.

    The file is read as UTF-8 from the current working directory. The
    trailing newline is removed from every line, and blank lines are skipped
    so that an empty string is never handed out as a proxy address.

    Returns:
        list[str]: The proxy entries in file order.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        return [line.rstrip('\n') for line in file if line.strip()]
class Gen:
    """A Chrome browser session tied to one token and, optionally, one proxy."""

    def __init__(self, token, proxy=None):
        """Start Chrome and remember the token/proxy for this session.

        Args:
            token: The token this session works with; stored as ``self.token``.
            proxy: Optional proxy address used for both HTTP and SSL traffic
                (presumably an ``ip:port`` string -- confirm against
                ``proxies.txt``). When falsy, no proxy is configured.
        """
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])

        # DesiredCapabilities.CHROME is a single class-level dict shared by
        # every thread; work on a copy so one session's proxy settings do not
        # leak into the others.
        capabilities = webdriver.DesiredCapabilities.CHROME.copy()
        if proxy:
            proxy_config = Proxy()
            proxy_config.proxy_type = ProxyType.MANUAL
            proxy_config.http_proxy = proxy
            proxy_config.ssl_proxy = proxy
            proxy_config.add_to_capabilities(capabilities)

        # The options and capabilities must actually be handed to the driver;
        # otherwise the logging switch and the proxy are silently ignored.
        # NOTE(review): keyword names follow the Selenium 3 style constructor
        # that the positional driver path already implies -- confirm version.
        self.browser = webdriver.Chrome(
            "chromedriver.exe",
            options=options,
            desired_capabilities=capabilities,
        )
        self.token = token
        self.proxy = proxy
        self.password = 'passwordhere'

    def register(self):
        """Run the registration flow (only a placeholder print is shown here)."""
        print('hi')
        # Code continues with no duplicates
# Tokens that some worker thread has already taken, shared by all threads.
_claimed_tokens = set()
# Guards _claimed_tokens so that "find an unused token and mark it used" is a
# single atomic step; without it two threads could pick the same token.
_claimed_tokens_lock = threading.Lock()


def worker(proxy=None):
    """Claim one not-yet-used token and run a registration with it.

    Every call re-reads the token file, but the choice is restricted to
    tokens that no earlier call in this process has claimed, so no two
    worker threads ever end up with the same token.

    Args:
        proxy: Optional proxy passed through to ``Gen``.

    Raises:
        RuntimeError: If every token in the file has already been claimed
            (for example when more threads are started than tokens exist).
    """
    token_list = get_token()
    with _claimed_tokens_lock:
        available = [t for t in token_list if t not in _claimed_tokens]
        if not available:
            raise RuntimeError('No unused token left in pokens.txt')
        token = random.choice(available)
        _claimed_tokens.add(token)
    # The browser work happens outside the lock so workers still run in
    # parallel; only the token selection is serialized.
    d = Gen(token, proxy=proxy)
    d.register()
def main():
    """Ask for a thread count and run that many workers in parallel.

    Each worker gets a randomly chosen proxy from ``get_proxy()``; when the
    proxy list is empty the worker is started with ``None`` (no proxy)
    instead of failing on an empty ``random.choice``. The function returns
    only after every worker thread has finished.

    Raises:
        ValueError: If the entered thread count is not an integer.
    """
    num_thread = int(input('Number of Threads: '))
    proxies = get_proxy()

    threads = []
    for _ in range(num_thread):
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy,))
        threads.append(t)
        t.start()

    # Wait for all workers so that main() returning means the work is done.
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
CodePudding user response:
Below is a simplified "toy version" of your program that I updated to do the following:
- Read the tokens-file from the main thread, into a list
- Randomly shuffle the order of the list
- Give each worker a roughly-equally-sized subset of the tokens-list for it to choose from
- Each worker merely prints out the data that it was given by the main thread (actually doing anything with the data is omitted, for clarity)
This approach avoids duplicates because any given token appears in the list only once, and each thread is given a different subset of the list to choose tokens from.
import threading
import random
def read_tokens_list():
    """Return the tokens stored in ``pokens.txt``, one per line.

    The file is read as UTF-8 from the current working directory. The
    trailing newline is removed from every line, and blank lines are skipped
    so that an empty string is never treated as a token.

    Returns:
        list[str]: The tokens in file order.

    Raises:
        OSError: If ``pokens.txt`` cannot be opened.
    """
    with open('pokens.txt', 'r', encoding='UTF-8') as file:
        return [line.rstrip('\n') for line in file if line.strip()]
def read_proxies_list():
    """Return the proxies stored in ``proxies.txt``, one per line.

    The file is read as UTF-8 from the current working directory. The
    trailing newline is removed from every line, and blank lines are skipped
    so that an empty string is never handed out as a proxy.

    Returns:
        list[str]: The proxy entries in file order.

    Raises:
        OSError: If ``proxies.txt`` cannot be opened.
    """
    with open('proxies.txt', 'r', encoding='UTF-8') as file:
        return [line.rstrip('\n') for line in file if line.strip()]
def worker(proxy, token_list):
    """Pick a random token from this worker's own share and report it.

    Args:
        proxy: The proxy assigned to this worker; only printed here.
        token_list: The tokens this worker may choose from.

    An empty ``token_list`` is reported and skipped rather than passed to
    ``random.choice``, which would raise ``IndexError``.
    """
    if not token_list:
        print("Worker: my proxy is [%s], my token list is empty, so there is no token to choose" % (proxy,))
        return
    token = random.choice(token_list)
    print("Worker: my proxy is [%s], my token list is %s, I've chosen [%s] as my token" % (proxy, token_list, token))
def main():
    """Split the shuffled tokens into disjoint shares and run one worker per share.

    The tokens file is read once, shuffled, and divided so that every token
    belongs to exactly one worker; a token can therefore be chosen by at most
    one thread. The function returns after all worker threads have finished.

    Raises:
        ValueError: If the entered thread count is not an integer.
    """
    num_thread = int(input('Number of Threads: '))
    if num_thread < 1:
        # Zero or negative counts leave nothing to do (and zero would be an
        # invalid slice step below).
        print('Number of threads must be at least 1')
        return

    proxies = read_proxies_list()
    token_list = read_tokens_list()  # read in the pokens.txt file
    random.shuffle(token_list)  # shuffle the list into random order

    # A worker with an empty share has nothing to choose from, so never start
    # more workers than there are tokens.
    if num_thread > len(token_list):
        print('Only %d token(s) available, so starting %d worker(s) instead of %d'
              % (len(token_list), len(token_list), num_thread))
        num_thread = len(token_list)

    threads = []
    for i in range(num_thread):
        # Every num_thread-th token starting at offset i: the shares are
        # disjoint, cover the whole list, and differ in size by at most one.
        tokens_for_this_worker = token_list[i::num_thread]
        # An empty proxies file means "no proxy" rather than an IndexError.
        proxy = random.choice(proxies) if proxies else None
        t = threading.Thread(target=worker, args=(proxy, tokens_for_this_worker))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()


if __name__ == '__main__':
    main()