Adibvafa committed on
Commit afcde13 · 1 Parent(s): 7844d6e

Improve style

Files changed (1)
  1. scripts/pdf_to_hf_dataset.py +83 -76
scripts/pdf_to_hf_dataset.py CHANGED
@@ -29,52 +29,52 @@ from datasets import Dataset
 
 class PDFToHFConverter:
     """Converter for PDF files to HuggingFace dataset format."""
-
+
     def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 300):
         """Initialize the converter with chunking configuration."""
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
-
+
         # Define text splitting separators
         separators = [
             "\n\n",  # Double newlines (paragraphs)
-            "\n", # Single newlines
-            ". ", # Sentences
-            "? ", # Questions
-            "! ", # Exclamations
-            "; ", # Semicolons
-            ", ", # Commas
-            " ", # Spaces
-            "" # Characters
+            "\n",  # Single newlines
+            ". ",  # Sentences
+            "? ",  # Questions
+            "! ",  # Exclamations
+            "; ",  # Semicolons
+            ", ",  # Commas
+            " ",  # Spaces
+            "",  # Characters
         ]
-
+
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             separators=separators,
             length_function=len,
         )
-
+
     def process_pdf(self, pdf_path: str) -> List[Dict[str, Any]]:
         """Process a single PDF file and return chunks with metadata."""
         try:
             print(f"Processing: {pdf_path}")
-
+
             # Load PDF
             loader = PyPDFLoader(pdf_path)
             documents = loader.load()
-
+
             if not documents:
                 print(f"Warning: No content extracted from {pdf_path}")
                 return []
-
+
             # Combine all pages into one document for better chunking
             full_text = "\n\n".join([doc.page_content for doc in documents])
-
+
             # Extract title (filename without extension)
             filename = Path(pdf_path).name
             title = Path(pdf_path).stem
-
+
             # Create a single document for chunking
             combined_doc = Document(
                 page_content=full_text,
@@ -82,27 +82,27 @@ class PDFToHFConverter:
                     "source": pdf_path,
                     "title": title,
                     "filename": filename,
-                    "total_pages": len(documents)
-                }
+                    "total_pages": len(documents),
+                },
             )
-
+
             # Split into chunks
             chunks = self.text_splitter.split_documents([combined_doc])
-
+
             # Convert to HF format
             hf_chunks = []
             for i, chunk in enumerate(chunks):
                 # Create unique ID using hash of content + position
                 content_hash = hashlib.md5(chunk.page_content.encode()).hexdigest()[:8]
                 chunk_id = f"{Path(pdf_path).stem}_{i:04d}_{content_hash}"
-
+
                 # Clean content
                 content = chunk.page_content.strip()
-
+
                 # Skip very short chunks
                 if len(content) < 100:
                     continue
-
+
                 hf_chunk = {
                     "id": chunk_id,
                     "title": title,
@@ -111,47 +111,48 @@ class PDFToHFConverter:
                     "filename": filename,
                     "chunk_index": i,
                     "total_chunks": len(chunks),
-                    "chunk_size": len(content)
+                    "chunk_size": len(content),
                 }
-
+
                 hf_chunks.append(hf_chunk)
-
+
             print(f"Created {len(hf_chunks)} chunks from {pdf_path}")
             return hf_chunks
-
+
         except Exception as e:
             print(f"Error processing {pdf_path}: {str(e)}")
             return []
-
-    def process_directory(self, input_dir: str, output_dir: str,
-                          output_format: str = "json") -> None:
+
+    def process_directory(
+        self, input_dir: str, output_dir: str, output_format: str = "json"
+    ) -> None:
         """Process all PDFs in a directory and save in HF format."""
         input_path = Path(input_dir)
         output_path = Path(output_dir)
         output_path.mkdir(parents=True, exist_ok=True)
-
+
         # Find all PDF files
        pdf_files = list(input_path.glob("**/*.pdf"))
-
+
         if not pdf_files:
             print(f"No PDF files found in {input_dir}")
             return
-
+
         print(f"Found {len(pdf_files)} PDF files to process")
-
+
         all_chunks = []
-
+
         # Process each PDF
         for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
             chunks = self.process_pdf(str(pdf_path))
             all_chunks.extend(chunks)
-
+
         if not all_chunks:
             print("No chunks were created from any PDFs")
             return
-
+
         print(f"Total chunks created: {len(all_chunks)}")
-
+
         # Save in requested format
         if output_format.lower() == "json":
             self.save_as_json(all_chunks, output_path)
@@ -164,53 +165,49 @@ class PDFToHFConverter:
         else:
             print(f"Unsupported format: {output_format}")
             return
-
+
         # Also save metadata
         self.save_metadata(all_chunks, output_path)
-
+
         print(f"Dataset saved to {output_path}")
         print(f"Ready for HuggingFace upload!")
-
+
     def save_as_json(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
         """Save chunks as JSON file."""
         output_file = output_path / "dataset.json"
-        with open(output_file, 'w', encoding='utf-8') as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             json.dump(chunks, f, indent=2, ensure_ascii=False)
         print(f"Saved JSON: {output_file}")
-
+
     def save_as_jsonl(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
         """Save chunks as JSONL file."""
         output_file = output_path / "dataset.jsonl"
-        with open(output_file, 'w', encoding='utf-8') as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             for chunk in chunks:
                 json.dump(chunk, f, ensure_ascii=False)
-                f.write('\n')
+                f.write("\n")
         print(f"Saved JSONL: {output_file}")
-
+
     def save_as_parquet(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
         """Save chunks as Parquet file."""
         # Create minimal version for HF (only required fields)
         hf_data = [
-            {
-                "id": chunk["id"],
-                "title": chunk["title"],
-                "content": chunk["content"]
-            }
+            {"id": chunk["id"], "title": chunk["title"], "content": chunk["content"]}
             for chunk in chunks
         ]
-
+
         df = pd.DataFrame(hf_data)
         output_file = output_path / "dataset.parquet"
         df.to_parquet(output_file, index=False)
         print(f"Saved Parquet: {output_file}")
-
+
     def save_as_csv(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
         """Save chunks as CSV file."""
         df = pd.DataFrame(chunks)
         output_file = output_path / "dataset.csv"
-        df.to_csv(output_file, index=False, encoding='utf-8')
+        df.to_csv(output_file, index=False, encoding="utf-8")
         print(f"Saved CSV: {output_file}")
-
+
     def save_metadata(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
         """Save dataset metadata and statistics."""
         metadata = {
@@ -220,36 +217,46 @@ class PDFToHFConverter:
             "chunk_size_config": self.chunk_size,
             "chunk_overlap_config": self.chunk_overlap,
             "sources": list(set(chunk["source"] for chunk in chunks)),
-            "titles": list(set(chunk["title"] for chunk in chunks))
+            "titles": list(set(chunk["title"] for chunk in chunks)),
         }
-
+
         metadata_file = output_path / "metadata.json"
-        with open(metadata_file, 'w', encoding='utf-8') as f:
+        with open(metadata_file, "w", encoding="utf-8") as f:
             json.dump(metadata, f, indent=2, ensure_ascii=False)
         print(f"Saved metadata: {metadata_file}")
 
+
 if __name__ == "__main__":
     """Main function to run the converter."""
     parser = argparse.ArgumentParser(description="Convert PDF files to HuggingFace dataset format")
     parser.add_argument("--input_dir", "-i", required=True, help="Directory containing PDF files")
     parser.add_argument("--output_dir", "-o", required=True, help="Output directory for dataset")
-    parser.add_argument("--format", "-f", default="parquet",
-                        choices=["json", "jsonl", "parquet", "csv"],
-                        help="Output format (default: parquet)")
-    parser.add_argument("--chunk_size", "-c", type=int, default=1500,
-                        help="Chunk size for text splitting (default: 1500)")
-    parser.add_argument("--chunk_overlap", "-ol", type=int, default=300,
-                        help="Chunk overlap for text splitting (default: 300)")
-
+    parser.add_argument(
+        "--format",
+        "-f",
+        default="parquet",
+        choices=["json", "jsonl", "parquet", "csv"],
+        help="Output format (default: parquet)",
+    )
+    parser.add_argument(
+        "--chunk_size",
+        "-c",
+        type=int,
+        default=1500,
+        help="Chunk size for text splitting (default: 1500)",
+    )
+    parser.add_argument(
+        "--chunk_overlap",
+        "-ol",
+        type=int,
+        default=300,
+        help="Chunk overlap for text splitting (default: 300)",
+    )
+
     args = parser.parse_args()
-
+
     # Create converter and process
-    converter = PDFToHFConverter(
-        chunk_size=args.chunk_size,
-        chunk_overlap=args.chunk_overlap
-    )
+    converter = PDFToHFConverter(chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap)
     converter.process_directory(
-        input_dir=args.input_dir,
-        output_dir=args.output_dir,
-        output_format=args.format
-    )
+        input_dir=args.input_dir, output_dir=args.output_dir, output_format=args.format
+    )
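
For context, a minimal usage sketch of the script as it stands after this commit. The class name, constructor defaults, the process_directory signature, and the CLI flags are taken from the diff above; the import path and the example directory names are illustrative assumptions, not part of the commit.

# Hypothetical usage sketch, roughly equivalent to running:
#   python scripts/pdf_to_hf_dataset.py -i ./pdfs -o ./dataset_out -f parquet
# The module path and directory names below are example values, not from the commit.
from pdf_to_hf_dataset import PDFToHFConverter  # assumes scripts/ is on sys.path

converter = PDFToHFConverter(chunk_size=1500, chunk_overlap=300)  # script defaults
converter.process_directory(
    input_dir="./pdfs",          # directory containing the source PDF files
    output_dir="./dataset_out",  # receives dataset.parquet plus metadata.json
    output_format="parquet",     # one of "json", "jsonl", "parquet", "csv"
)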