feat(bulk-ingest): Add --ignored Flag to Exclude Specific Files and Directories During Ingestion (#1432)
This commit is contained in:
		
							parent
							
								
									24fae660e6
								
							
						
					
					
						commit
						b178b51451
					
				|  | @ -1,4 +1,6 @@ | |||
| .venv | ||||
| .env | ||||
| venv | ||||
| 
 | ||||
| settings-me.yaml | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										17
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										17
									
								
								Makefile
								
								
								
								
							|  | @ -56,3 +56,20 @@ wipe: | |||
| 
 | ||||
| setup: | ||||
| 	poetry run python scripts/setup | ||||
| 
 | ||||
| list: | ||||
| 	@echo "Available commands:" | ||||
| 	@echo "  test            : Run tests using pytest" | ||||
| 	@echo "  test-coverage   : Run tests with coverage report" | ||||
| 	@echo "  black           : Check code format with black" | ||||
| 	@echo "  ruff            : Check code with ruff" | ||||
| 	@echo "  format          : Format code with black and ruff" | ||||
| 	@echo "  mypy            : Run mypy for type checking" | ||||
| 	@echo "  check           : Run format and mypy commands" | ||||
| 	@echo "  run             : Run the application" | ||||
| 	@echo "  dev-windows     : Run the application in development mode on Windows" | ||||
| 	@echo "  dev             : Run the application in development mode" | ||||
| 	@echo "  api-docs        : Generate API documentation" | ||||
| 	@echo "  ingest          : Ingest data using specified script" | ||||
| 	@echo "  wipe            : Wipe data using specified script" | ||||
| 	@echo "  setup           : Setup the application" | ||||
|  |  | |||
|  | @ -20,20 +20,20 @@ class LocalIngestWorker: | |||
| 
 | ||||
|         self._files_under_root_folder: list[Path] = list() | ||||
| 
 | ||||
|     def _find_all_files_in_folder(self, root_path: Path) -> None: | ||||
|     def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None: | ||||
|         """Search all files under the root folder recursively. | ||||
|         Count them at the same time | ||||
|         """ | ||||
|         for file_path in root_path.iterdir(): | ||||
|             if file_path.is_file(): | ||||
|             if file_path.is_file() and file_path.name not in ignored: | ||||
|                 self.total_documents += 1 | ||||
|                 self._files_under_root_folder.append(file_path) | ||||
|             elif file_path.is_dir(): | ||||
|                 self._find_all_files_in_folder(file_path) | ||||
|             elif file_path.is_dir() and file_path.name not in ignored: | ||||
|                 self._find_all_files_in_folder(file_path, ignored) | ||||
| 
 | ||||
|     def ingest_folder(self, folder_path: Path) -> None: | ||||
|     def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None: | ||||
|         # Count total documents before ingestion | ||||
|         self._find_all_files_in_folder(folder_path) | ||||
|         self._find_all_files_in_folder(folder_path, ignored) | ||||
|         self._ingest_all(self._files_under_root_folder) | ||||
| 
 | ||||
|     def _ingest_all(self, files_to_ingest: list[Path]) -> None: | ||||
|  | @ -64,12 +64,19 @@ parser.add_argument( | |||
|     action=argparse.BooleanOptionalAction, | ||||
|     default=False, | ||||
| ) | ||||
| parser.add_argument( | ||||
|     "--ignored", | ||||
|     nargs="*", | ||||
|     help="List of files/directories to ignore", | ||||
|     default=[], | ||||
| ) | ||||
| parser.add_argument( | ||||
|     "--log-file", | ||||
|     help="Optional path to a log file. If provided, logs will be written to this file.", | ||||
|     type=str, | ||||
|     default=None, | ||||
| ) | ||||
| 
 | ||||
| args = parser.parse_args() | ||||
| 
 | ||||
| # Set up logging to a file if a path is provided | ||||
|  | @ -91,9 +98,17 @@ if __name__ == "__main__": | |||
| 
 | ||||
|     ingest_service = global_injector.get(IngestService) | ||||
|     worker = LocalIngestWorker(ingest_service) | ||||
|     worker.ingest_folder(root_path) | ||||
|     worker.ingest_folder(root_path, args.ignored) | ||||
| 
 | ||||
|     if args.ignored: | ||||
|         logger.info(f"Skipping following files and directories: {args.ignored}") | ||||
| 
 | ||||
|     if args.watch: | ||||
|         logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...") | ||||
|         directories_to_watch = [ | ||||
|             dir | ||||
|             for dir in root_path.iterdir() | ||||
|             if dir.is_dir() and dir.name not in args.ignored | ||||
|         ] | ||||
|         watcher = IngestWatcher(args.folder, worker.ingest_on_watch) | ||||
|         watcher.start() | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue