"""Filter the dict-literal records stored in tar.gz archives by ``language_score``.

When run as a script, every archive found in ``../HEAD`` is read, its records
are parsed, and the ones whose ``language_score`` exceeds 0.98 are written to
``../HEAD_CLEAN/<archive name>.jsonl`` as JSON Lines.
"""
import tarfile
from ast import literal_eval
from pathlib import Path
from typing import Any, Dict, Iterable, List

import pandas as pd

# Each record in the raw text is a Python dict literal beginning with this key;
# the text is cut into records at every occurrence of it.
RECORD_MARKER = "{'url'"

# Everything that can go wrong while parsing one record and reading its score:
# literal_eval failures on malformed text, a missing 'language_score' key, a
# non-dict literal, or a score that cannot be compared with a float.
_RECORD_ERRORS = (
    ValueError,
    SyntaxError,
    TypeError,
    KeyError,
    MemoryError,
    RecursionError,
)


def _split_records(text: str) -> List[str]:
    """Cut *text* at every RECORD_MARKER and return the pieces, marker restored."""
    # The piece before the first marker is not a record, so it is dropped.
    pieces = text.split(RECORD_MARKER)[1:]
    return [RECORD_MARKER + piece for piece in pieces]


def tar_file_to_string(filename) -> List[str]:
    """Return the record strings found in the gzip-compressed tar *filename*.

    Only the first line of each member is read; it is decoded as UTF-8 and
    split into substrings that each start with ``{'url'``.

    Members that are not regular files (``extractfile`` returns ``None`` for
    them, e.g. directories) are skipped, and an archive without any readable
    member yields an empty list.

    NOTE(review): when an archive holds several regular files, each member
    overwrites the previous result, so only the last one's records are
    returned. The original formatting of this file was lost; confirm whether
    accumulating across members was intended.

    Args:
        filename: Path of the ``.tar.gz`` archive to read.

    Returns:
        A list of record strings, possibly empty.

    Raises:
        tarfile.TarError, OSError: If the archive cannot be opened or read.
        UnicodeDecodeError: If the first line of a member is not valid UTF-8.
    """
    records: List[str] = []
    with tarfile.open(filename, "r:gz") as tar:
        for member in tar.getmembers():
            extracted = tar.extractfile(member)
            if extracted is None:
                # Directories and other non-file members carry no data.
                continue
            with extracted:
                first_line = extracted.readline()
            records = _split_records(first_line.decode("utf-8"))
    return records


def filter_by_language_score(
    records: Iterable[str], min_score: float = 0.98
) -> List[Dict[str, Any]]:
    """Parse *records* and keep those whose ``language_score`` exceeds *min_score*.

    Each record is evaluated once with ``ast.literal_eval``. Records that
    cannot be parsed, lack a ``language_score`` key, or whose score cannot be
    compared with *min_score* are skipped (best effort). Unlike a bare
    ``except``, ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed.

    Args:
        records: Record strings, each expected to be a Python dict literal.
        min_score: Exclusive lower bound for ``language_score``.

    Returns:
        The parsed records that passed the threshold, in input order.
    """
    kept = []
    for raw in records:
        try:
            parsed = literal_eval(raw)
            if parsed["language_score"] > min_score:
                kept.append(parsed)
        except _RECORD_ERRORS:
            # Malformed or incomplete record: skip it and carry on.
            continue
    return kept


def main(
    source_dir=Path("../HEAD"),
    output_dir=Path("../HEAD_CLEAN"),
    min_score: float = 0.98,
) -> None:
    """Filter every archive in *source_dir* and write one ``.jsonl`` per archive.

    Args:
        source_dir: Directory whose entries are all opened as tar.gz archives.
        output_dir: Directory receiving the ``<name>.jsonl`` files; it is
            created if it does not exist yet.
        min_score: Exclusive lower bound for ``language_score``.
    """
    # The progress bar is only needed when running as a script; importing it
    # here keeps the helpers above importable without rich installed.
    from rich.progress import track

    source_dir = Path(source_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for tar_gz in source_dir.iterdir():
        filename = tar_gz.name.split('.tar.gz')[0]
        print(f"Now extracting {filename}")
        records = tar_file_to_string(tar_gz)
        kept = filter_by_language_score(track(records), min_score)
        pd.DataFrame(kept).to_json(
            output_dir / f"{filename}.jsonl", orient='records', lines=True
        )


if __name__ == "__main__":
    main()