Source code for urlchecker.core.fileproc

"""

Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.

"""

import fnmatch
import os
import re
from typing import List

from urlchecker.core import urlmarker


def check_file_type(file_path: str, file_types: List[str]) -> bool:
    """
    Check file type to assert that only files with certain predefined
    extensions are checked. Each entry of file_types is either an extension
    taken verbatim (e.g. ".md") or a shell-style glob pattern that is matched
    against file_path with fnmatch (not a regular expression). For example,
    .* matches all hidden files, and *.html matches an html file.

    Args:
        - file_path  (str) : path to file.
        - file_types (list) : list of file extensions or glob patterns to accept.

    Returns:
        (bool) true if file type is supported else false.
    """
    # Only derive an extension when the path actually contains a dot;
    # otherwise a file literally named "md" would be treated as having the
    # ".md" extension and be accepted by mistake.
    _, dot, suffix = file_path.rpartition(".")
    if dot and (dot + suffix) in file_types:
        return True

    # The user can also provide a glob pattern (fnmatch syntax)
    if any(fnmatch.fnmatch(file_path, pattern) for pattern in file_types):
        return True

    # default return
    return False
def include_file(
    file_path: str,
    exclude_patterns: List[str] = None,
    include_patterns: List[str] = None,
) -> bool:
    """
    Decide whether a file path should be kept, based on OR-combined regular
    expressions. The user is currently not notified if a file is dropped.

    Args:
        - file_path        (str) : a file path to check if should be included.
        - exclude_patterns (list) : list of regular expressions to exclude.
        - include_patterns (list) : list of regular expressions to include.

    Returns:
        (bool) true if the file should be included (checked), false if it is
               filtered out.
    """
    wanted = include_patterns or []
    unwanted = exclude_patterns or []

    def matches_any(patterns: List[str]) -> bool:
        # Join the patterns into one alternation group and search the path.
        return re.search("(%s)" % "|".join(patterns), file_path) is not None

    # No patterns at all: every file is included
    if not wanted and not unwanted:
        return True

    # Only exclusions given: keep the file unless one of them matches
    if not wanted:
        return not matches_any(unwanted)

    # Only inclusions given: keep the file only if one of them matches
    if not unwanted:
        return matches_any(wanted)

    # Both given: must be included, and exclusion takes preference
    return matches_any(wanted) and not matches_any(unwanted)
def get_file_paths(
    base_path: str,
    file_types: List[str],
    exclude_files: List[str] = None,
    include_patterns: List[str] = None,
) -> List[str]:
    """
    Collect the paths of all accepted files under a given directory and its
    subfolders.

    Args:
        - base_path        (str) : base path.
        - file_types       (list) : list of file extensions to accept.
        - include_patterns (list) : list of files and patterns to include.
        - exclude_files    (list) : list of files or patterns to exclude

    Returns:
        (list) list of file paths.
    """
    excluded = exclude_files or []
    included = include_patterns or []

    collected = []

    # walk folders and collect file paths
    for root, _subdirs, names in os.walk(base_path):
        for name in names:
            candidate = os.path.join(root, name)

            if not os.path.isfile(candidate):
                continue

            # the type check sees the bare file name, the include/exclude
            # check sees the joined path
            if not check_file_type(name, file_types):
                continue

            if include_file(candidate, excluded, included):
                collected.append(candidate)

    return collected
def remove_empty(file_list: List[str]) -> List[str]:
    """
    Filter a file list down to the entries that are neither None nor the
    empty string.

    Args:
        - file_list (list): a list of files to remove None or empty string from.

    Returns:
        (list) list of (non None or empty string) contents.
    """
    rejected = ("", None)

    kept = []
    for entry in file_list:
        if entry in rejected:
            continue
        kept.append(entry)
    return kept