Best Python code snippet using lemoncheesecake
Source:mhs_layout_analisys.py  
import cv2
import numpy as np
from utils import conditional_save, get_conditional_path
def cc_analisys(img) -> 'tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]':
    '''Find connected components and extract features from them.
    Get the connected components and their area, density, bounding box, inner
    CCs and height/width rate.
    Args:
        img (cv2 image): inverse binary image.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: area,
        density, bounding box, number of inner CCs and height/width rate for each
        connected component.
    '''
    n, _, cc, _ = cv2.connectedComponentsWithStats(img, connectivity=8, ltype=cv2.CV_32S)
    ### Connected component analysis
    area = np.zeros(n, dtype=int)
    density = np.zeros(n, dtype=float)
    rect = np.zeros((n, 4), dtype=int)
    inc = np.zeros(n, dtype=int)
    hw_rate = np.zeros(n)
    for i in range(1, n):
        h = cc[i, cv2.CC_STAT_HEIGHT]
        w = cc[i, cv2.CC_STAT_WIDTH]
        area[i] = cc[i, cv2.CC_STAT_AREA]
        density[i] = area[i] / (w*h)
        hw_rate[i] = min(w, h) / max(w, h)
        rect[i, [0,1]] = cc[i, [cv2.CC_STAT_LEFT, cv2.CC_STAT_TOP]]
        rect[i, [2,3]] = [w, h]
    # count, for each CC, how many other (not too small) CCs its bounding box contains
    for i in range(1, n):
        contained = (rect[:, 0] >= rect[i, 0]) & (rect[:, 0] + rect[:, 2] <= rect[i, 0] + rect[i, 2]) & (rect[:, 1] >= rect[i, 1]) & (rect[:, 1] + rect[:, 3] <= rect[i, 1] + rect[i, 3])
        contained[i] = False
        contained = contained & (area >= area[i] * 0.05)
        inc[i] = contained.sum()

    return area, density, rect, inc, hw_rate
def heuristic_filter(img, area: np.ndarray, density: np.ndarray, rect: np.ndarray, inc: np.ndarray, hw_rate: np.ndarray) -> 'tuple[np.ndarray, np.ndarray]':
    '''Apply a heuristic filter to remove non-text elements from an image.
    Use the heuristic filter defined by (Tran et al. 2017) to identify and
    remove non-text elements from an image.
    Args:
        img (cv2 image): inverse binary image
        area (np.ndarray): areas (number of pixels) of the CCs
        density (np.ndarray): density of the CCs
        rect (np.ndarray): bounding boxes of the CCs
        inc (np.ndarray): number of contained CCs
        hw_rate (np.ndarray): height/width rate of the CCs
    Returns:
        tuple[np.ndarray, np.ndarray]: the image without the non-text elements,
        and a boolean mask for the text CCs.
    '''
    is_text = np.full(rect.shape[0], True, dtype=bool)
    is_text = is_text & (rect[:, 0] > 0)
    is_text = is_text & (rect[:, 1] > 0)
    is_text = is_text & (rect[:, 0] + rect[:, 2] < img.shape[1])
    is_text = is_text & (rect[:, 1] + rect[:, 3] < img.shape[0])
    is_text = is_text & (inc <= 4)
    is_text = is_text & (area >= 20)
    is_text = is_text & ~((hw_rate < 0.1))# & (rect[:, 3] < rect[:, 2]))
    is_text = is_text & (density >= 0.06)
    # is_text = is_text & (density <= 0.9)
    out = img.copy() * 0
    for x,y,w,h in rect[is_text]:
        out[y:y+h, x:x+w] = img[y:y+h, x:x+w]
    return out, is_text
def get_gradient(R, s: int, axis: int = 1, t: int = 0) -> np.ndarray:
    '''Calculate the gradient for the projection on the image.
    Using the method outlined in (Tran et al. 2016), calculate the gradient of
    the horizontal (axis=0) or vertical (axis=1) projection.
    Args:
        R (cv2 image): region to calculate the gradient for
        s (int): smoothing parameter; window to smooth the projection
        axis (int): axis to project
        t (int): maximum number of pixels in a row to consider the row black
    Returns:
        np.ndarray: gradient of the projection
    '''
    ph = np.sum(R > 0, axis)
    ph[ph<t] = 0
    zh = np.zeros_like(ph)
    # s = int(ph.shape[0] * 0.05)
    for x in range(zh.shape[0]):
        i = max(x - s, 0)
        j = min(x + s, zh.shape[0])
        zh[x] = np.floor(np.sum(ph[i:j] / (2*s)))
    if zh.shape[0] < 2:
        return np.array([0])
    gh = np.round(np.gradient(zh, edge_order=1)).astype(int)

    return gh
def check_homogeneity(R, s: int, axis: int = 1, t: int = 0) -> bool:
    '''Check if a region is homogeneous.
    Using the method outlined in (Tran et al. 2016), calculate the homogeneity
    structure of the region.
    Args:
        R (cv2 image): region to calculate the gradient for
        s (int): smoothing parameter; window to smooth the projection
        axis (int): axis to project
        t (int): maximum number of pixels in a row to consider the row black
    Returns:
        bool: whether the region is homogeneous
    '''
    gh = get_gradient(R, s, axis, t=t)
    # positions where the smoothed projection gradient changes sign
    lh = [i for i in range(gh.shape[0]-1) if (gh[i] < 0 and gh[i+1] >= 0) or (gh[i] > 0 and gh[i+1] <= 0)]
    delta = np.array([lh[i+1] - lh[i] for i in range(len(lh)-1)])
    if delta.shape[0] > 0:
        v = np.var(delta)
        return v <= 50

    return True
def get_lines(R, axis: int, t: int = 0) -> 'tuple[tuple[list[int], list[int]], tuple[list[int], list[int]]]':
    '''Find the black and white lines of a region.
    Use the horizontal or vertical projection to find black lines and white
    lines in the region, respecting the threshold.
    Args:
        R (cv2 image): region to find the lines
        axis (int): axis to project
        t (int): maximum number of pixels in a row to consider the row a white line

    Returns:
        tuple[tuple[list[int], list[int]], tuple[list[int], list[int]]]: index
        and heights of the white lines and black lines found.
    '''
    p = np.sum(R > 0, axis=axis)
    flags = np.zeros_like(p, dtype=bool)
    heights = np.zeros_like(p)
    prev = p[0]
    # flag = True -> black line
    flags = p > t
    heights[0] = 1

    for i in range(1, p.shape[0]):
        if (p[i] <= t and prev <= t) or (p[i] > t and prev > t):
            heights[i] = heights[i-1] + 1
        else:
            heights[i] = 1

        prev = p[i]

    white = []
    black = []
    white_heights = []
    black_heights = []
    bounds = [b for b in np.argwhere(heights == 1).flatten()] + [heights.shape[0]]
    for b in range(len(bounds) - 1):
        start, end = bounds[b], bounds[b+1]
        if flags[start]:
            black.append((end + start) // 2)
            black_heights.append(np.max(heights[start:end]))
        else:
            white.append((end + start) // 2)
            white_heights.append(np.max(heights[start:end]))

    return (white, white_heights), (black, black_heights)
def find_last_before(white: 'list[int]', x: int) -> int:
    '''Find the last white line before a certain position.
    Args:
        white (list[int]): list of white lines
        x (int): position

    Returns:
        int: the index for the last white line before x, -1 if no white line exists before x
    '''
    k = -1
    for i in range(len(white)):
        if white[i] < x:
            k = i
        else:
            break
    return k
def get_division(R, axis: int, t: int = 0) -> 'list[tuple[int, int]]':
    '''Calculate the positions to divide the region.
    Use the height of black and white lines in the region to calculate the cutting points.
    Args:
        R (cv2 image): region to find the lines
        axis (int): axis to project
        t (int): maximum number of pixels in a row to consider the row a white line
    Returns:
        list[tuple[int, int]]: list of cuts to make along the specified axis
    '''
    (white, white_heights), (black, black_heights) = get_lines(R, axis, t)

    wi = np.argwhere((white_heights == np.max(white_heights)) & (white_heights > np.median(white_heights))).flatten() if len(white) > 0 else np.array([])
    bi = np.argwhere((black_heights == np.max(black_heights)) & (black_heights > np.median(black_heights))).flatten() if len(black) > 0 else np.array([])
    div = []
    wdiv = []
    bdiv = []
    if wi.shape[0] > 0: # white division
        prev = 0
        for w in wi:
            wdiv.append((prev, white[w] - white_heights[w] // 2))
            prev = white[w] + white_heights[w] // 2
        wdiv.append((prev, R.shape[1-axis]))
    if bi.shape[0] > 0: # black division
        prev = 0
        for b in bi:
            i = find_last_before(white, black[b])
            if i != -1:
                first = white[i]
                second = white[i+1] if i+1 < len(white) else first
                first = white[b] if b < len(white) else white[-1]
                second = white[b+1] if b+1 < len(white) else white[-1]
                if first == second:
                    bdiv.append((prev, first - white_heights[i] // 2))
                    prev = first + white_heights[i] // 2
                else:
                    bdiv.append((prev, first - white_heights[i] // 2))
                    bdiv.append((first + white_heights[i] // 2, second - white_heights[i+1] // 2))
                    prev = second
        if prev > 0:
            bdiv.append((prev, R.shape[1-axis]))

    divs = []
    for d in wdiv + bdiv:
        divs.extend(d)
    divs = sorted(list(set(divs))) # remove duplicates and sort
    divs = [(divs[i], divs[i+1]) for i in range(len(divs)-1)]

    return divs
def recursive_splitting(img, rect: np.ndarray, is_text: np.ndarray, area: np.ndarray, t: float = 0.01, do_filter: bool = True) -> 'tuple[list, list[np.ndarray]]':
    '''Split an image into homogeneous regions.
    Use the method described by (Tran et al. 2016) to split the image into
    multiple homogeneous regions.
    Args:
        img (cv2 image): the image to split
        rect (np.ndarray): bounding boxes of all the CCs
        is_text (np.ndarray): boolean mask for the text CCs
        area (np.ndarray): area (number of filled pixels) for each CC
        t (float): the threshold of pixels to ignore when computing homogeneity
        do_filter (bool): whether to execute the recursive filter when splitting.
    Returns:
        tuple[list, list[np.ndarray]]: list of regions and their coordinates on the original image.
    '''
    finished_regions = []
    finished_coords = []
    regions = [img]
    coords = [(0, 0, img.shape[1], img.shape[0])]
    all_coords = [coords[0]]
    new_regions = [0]
    while len(new_regions) > 0:
        new_regions = []
        new_homo = []
        new_coords = []
        for i in range(len(regions)):
            # print('in', coords[i])
            x, y, w, h = coords[i]
            # s = int(np.sqrt(w*h) * 0.05)
            homo = check_homogeneity(regions[i], int(w*0.05), 0, int(w*t)) and check_homogeneity(regions[i], int(h*0.05), 1, int(h*t))
            if homo:
                # print('homo!')
                finished_regions.append(regions[i])
                finished_coords.append(coords[i])
            else:
                hdivs = get_division(regions[i], 1, int(w * t))
                vdivs = get_division(regions[i], 0, int(h * t))
                divs = []
                for hd in hdivs:
                    for vd in vdivs:
                        x1, x2 = min(vd[0], vd[1]), max(vd[0], vd[1])
                        y1, y2 = min(hd[0], hd[1]), max(hd[0], hd[1])
                        divs.append((x1, x2, y1, y2))
                # print('got', len(divs), 'divisions')

                for x1,x2,y1,y2 in divs:
                    rct = (x+x1, y+y1, x2-x1, y2-y1)
                    if x2-x1 > 3 and y2-y1 > 3 and rct not in all_coords:
                        # print('found', rct)
                        if do_filter:
                            filtered = regions[i][y1:y2, x1:x2].copy()
                            recursive_filter(filtered, rct, rect, is_text, area)
                            if converge(regions[i][y1:y2, x1:x2], filtered):
                                finished_regions.append(filtered)
                                finished_coords.append(rct)
                            else:
                                new_coords.append(rct)
                                new_regions.append(filtered)
                        else:
                            new_coords.append(rct)
                            new_regions.append(regions[i][y1:y2, x1:x2])
                            # if new_regions[-1].shape[0] != rct[3] or new_regions[-1].shape[1] != rct[2]:
                            #     print(new_regions[-1].shape, rct)
                        all_coords.append(rct)
                if len(divs) == 0:
                    # print('unable to divide')
                    finished_regions.append(regions[i])
                    finished_coords.append(coords[i])

        # print('scanned', len(regions), 'regions.', len(new_regions), 'new regions found')
        regions = new_regions
        homo = new_homo
        coords = new_coords

    return finished_regions, finished_coords
### Recursive Filter
def converge(region, after_filter) -> bool:
    '''Check the regions against the convergence criteria.
    Args:
        region (cv2 image): region before operation
        after_filter (cv2 image): region after operation
    Returns:
        bool: True if the algorithm converged for this region
    '''
    Su = np.sum(region)
    Sv = np.sum(after_filter)
    return Su == Sv or Sv == 0
def compute_k(omega: np.ndarray) -> float:
    '''Calculate the k-value for each omega list.
    Args:
        omega (np.ndarray): array of widths, heights or areas of the CCs in the region

    Returns:
        float: the k calculated by the formula defined in (Tran et al. 2016)
    '''
    return max(np.mean(omega) / np.median(omega), np.median(omega) / np.mean(omega))
def compute_suspected_max(omega: np.ndarray, k: float) -> np.ndarray:
    '''Find the suspected non-text elements by the maximum-median filter.
    Args:
        omega (np.ndarray): array of widths, heights or areas of the CCs in the region
        k (float): the k calculated by the formula defined in (Tran et al. 2016)

    Returns:
        np.array: boolean mask for the suspected non-text elements
    '''
    return (omega == np.max(omega)) & (omega > k * np.median(omega))
def compute_suspected_min(omega, k):
    '''Find the suspected non-text elements by the minimum-median filter.
    Args:
        omega (np.ndarray): array of widths, heights or areas of the CCs in the region
        k (float): the k calculated by the formula defined in (Tran et al. 2016)

    Returns:
        np.array: boolean mask for the suspected non-text elements
    '''
    return (omega == np.min(omega)) & (omega < np.median(omega) / k)
def is_in_range(v: np.ndarray, start: int, end: int) -> np.ndarray:
    '''Check if the elements in a vector lie in an interval.
    Args:
        v (np.ndarray): vector of CCs to check
        start (int): start of the interval
        end (int): end of the interval

    Returns:
        np.ndarray: boolean mask for the CCs that are in the range
    '''
    return (v > start) & (v < end)
def get_neigh(CCu: np.ndarray) -> 'tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]':
    '''Find the neighbouring CCs for each CC.
    Use the method described by (Chen et al. 2013) to calculate the neighbours of a CC.
    Args:
        CCu (np.ndarray): all the CCs to use in the analysis

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: LNN (left nearest
        neighbour), RNN (right nearest neighbour), LNWS (left nearest white
        space) and RNWS (right nearest whitespace).
    '''
    lnn = np.zeros(CCu.shape[0])
    rnn = np.zeros(CCu.shape[0])
    lnws = np.zeros(CCu.shape[0])
    rnws = np.zeros(CCu.shape[0])

    for i in range(CCu.shape[0]):
        CCi = CCu[i]
        # CCs whose vertical extent overlaps CCi's are neighbour candidates
        overlap1 = is_in_range(CCu[:, 1], CCi[1], CCi[1] + CCi[3])
        overlap2 = is_in_range(CCu[:, 1] + CCu[:, 3], CCi[1], CCi[1] + CCi[3])
        overlap3 = is_in_range(CCi[1], CCu[:, 1], CCu[:, 1] + CCu[:, 3])
        overlap4 = is_in_range(CCi[1] + CCi[3], CCu[:, 1], CCu[:, 1] + CCu[:, 3])
        vertical_overlap = overlap1 | overlap2 | overlap3 | overlap4
        ws_left = (CCu[:,0] + CCu[:,2]) - CCi[0]
        ws_right = CCu[:,0] - (CCi[0] + CCi[2])
        _lnn = np.argsort(ws_left)
        _rnn = np.argsort(ws_right)
        _lnn = _lnn[vertical_overlap[_lnn]]
        _rnn = _rnn[vertical_overlap[_rnn]]
        lnn[i] = _lnn[0] if _lnn.shape[0] > 0 else -1
        rnn[i] = _rnn[0] if _rnn.shape[0] > 0 else -1
        lnws[i] = ws_left[_lnn[0]] if _lnn.shape[0] > 0 else -1
        rnws[i] = ws_right[_rnn[0]] if _rnn.shape[0] > 0 else -1

    return lnn, rnn, lnws, rnws
def get_cc_in_region(region: np.ndarray, cc: np.ndarray) -> 'list[tuple[int, int, int, int, int]]':
    '''Find all the CCs contained in a region.
    Args:
        region (np.ndarray): bounding box of the region
        cc (np.ndarray): bounding box for all of the CCs in the image

    Returns:
        list[tuple[int, int, int, int, int]]: bounding box (x, y, w, h) and index
        of every CC contained in the region.
    '''
    return [(cc[i][0],cc[i][1],cc[i][2],cc[i][3], i) for i in range(cc.shape[0]) if cc[i][0] > region[0] and cc[i][0]+cc[i][2] < region[0]+region[2] and cc[i][1] > region[1] and cc[i][1]+cc[i][3] < region[1]+region[3]]
def recursive_filter(region, coords: np.ndarray, rect: np.ndarray, is_text: np.ndarray, area: np.ndarray):
    '''Apply the recursive filter to a region.
    Use the recursive filter described by (Tran et al. 2016) to eliminate
    non-text elements not caught by the heuristic filter.
    Args:
        region (cv2 image): image to apply the filter
        coords (np.ndarray): bounding box of the region
        rect (np.ndarray): bounding boxes of all the CCs
        is_text (np.ndarray): boolean mask for the text CCs
        area (np.ndarray): area (number of filled pixels) for each CC
    '''
    CCs = np.array(get_cc_in_region(coords, rect[is_text]))
    if CCs.shape[0] == 0: return
    indicies = CCs[:,-1]
    CCu = CCs[:,:-1]
    # map positions in the text-only arrays back to indices in the full CC arrays
    text_idx = np.flatnonzero(is_text)
    omega1 = area[is_text][indicies]#np.array([CCi[2]*CCi[3] for CCi in CCu])
    omega2 = np.array([CCi[3] for CCi in CCu])
    omega3 = np.array([CCi[2] for CCi in CCu])

    lnn, rnn, lnws, rnws = get_neigh(CCu)

    num_ln = np.array([(lnn == i).sum() for i in range(lnn.shape[0])])
    num_rn = np.array([(rnn == i).sum() for i in range(rnn.shape[0])])
    ws = rnws[rnws > 0] if (rnws>0).any() else np.array([0])
    k1, k2, k3 = compute_k(omega1), compute_k(omega2), compute_k(omega3)
    # maximum median filter
    suspected = compute_suspected_max(omega1, k1) & (compute_suspected_max(omega2, k2) | compute_suspected_max(omega3, k3))
    lnws[lnws == -1] = 1e10
    rnws[rnws == -1] = 1e10
    mi = np.min([lnws, rnws], axis=0)
    cond1 = mi > max(np.median(ws), np.mean(ws))
    lnws[lnws == 1e10] = -1
    rnws[rnws == 1e10] = -1
    ma = np.max([lnws, rnws], axis=0)
    cond1 &= (ma == np.max(ws)) | (mi > 2 * np.mean(ws))
    cond2 = (num_ln == np.max(num_ln)) & (num_ln > 2)
    cond2 |= (num_rn == np.max(num_rn)) & (num_rn > 2)
    non_text = suspected & (cond1 | cond2)
    # minimum median filter
    suspected = compute_suspected_min(omega2, k2) | compute_suspected_min(omega3, k3)
    lnws[lnws == -1] = 1e10
    rnws[rnws == -1] = 1e10
    mi = np.min([lnws, rnws], axis=0)
    cond1 = mi > max(np.median(ws), np.mean(ws))
    non_text |= suspected & cond1
    # erase the suspected non-text CCs from the region and unmark them as text
    for j, (x,y,w,h) in zip(indicies[non_text], CCu[non_text]):
        x -= coords[0]
        y -= coords[1]
        cv2.rectangle(region, (x, y), (x+w, y+h), 0, -1)
        is_text[text_idx[j]] = False
### Multi-Layer Classification
def multi_layer(img, rect: np.ndarray, is_text: np.ndarray, area: np.ndarray, t: float = 0):
    '''Apply the multi-layer classification to an image.
    Use the method described by (Tran et al. 2017) to eliminate further non-text
    elements.
    Args:
        img (cv2 image): image to apply the ML classification
        rect (np.ndarray): bounding boxes of all the CCs
        is_text (np.ndarray): boolean mask for the text CCs
        area (np.ndarray): area (number of filled pixels) for each CC
        t (float): the threshold of pixels to ignore

    Returns:
        cv2 image: text image after the removal of all the non-text elements
    '''
    prev = img.copy() * 0
    current = img.copy()
    i = 0
    while not converge(prev, current):
        rs = []
        cs = []
        hdivs = get_division(current, 1, int(img.shape[0] * t))
        vdivs = get_division(current, 0, int(img.shape[1] * t))
        divs = []
        for hd in hdivs:
            for vd in vdivs:
                x1, x2 = min(vd[0], vd[1]), max(vd[0], vd[1])
                y1, y2 = min(hd[0], hd[1]), max(hd[0], hd[1])
                divs.append((x1, x2, y1, y2))
        for x1,x2,y1,y2 in divs:
            rct = (x1, y1, x2-x1, y2-y1)
            cs.append(rct)
            rs.append(current[y1:y2, x1:x2])

        prev = current
        current = current.copy() * 0
        for i in range(len(rs)):
            recursive_filter(rs[i], cs[i], rect, is_text, area)
            x,y,w,h = cs[i]
            current[y:y+h, x:x+w] = rs[i]
        i += 1
    # print(i, 'iterations')
    return current
def segment(img_bw, temp_folder: str = None, output_path: str = None) -> 'tuple[np.ndarray, list, list[np.ndarray]]':
    '''Segment an image using an MHS based approach.
    Implements an MHS (Tran et al. 2017) based approach for document text region
    identification based on homogeneity.
    Args:
        img_bw (cv2 image): binarized image to segment
        temp_folder (str): folder to save intermediary files to, if None does not save. default=None
        output_path (str): path to the resulting image with only text elements, if None does not save. default=None

    Returns:
        tuple[np.ndarray, list, list[np.ndarray]]: the text document, a list of
        all the regions and all of their coordinates.
    '''
    _, thresh = cv2.threshold(img_bw, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    area, density, rect, inc, hw_rate = cc_analisys(thresh)
    thresh, is_text = heuristic_filter(thresh, area, density, rect, inc, hw_rate)
    conditional_save(thresh, get_conditional_path('heuristic_filter.png', temp_folder))
    # in case there is a text element that is now empty, make it non-text
    for i in range(rect.shape[0]):
        if is_text[i]:
            x,y,w,h = rect[i]
            is_text[i] = np.any(thresh[y:y+h,x:x+w] > 0)

    if temp_folder:
        img_boxes = thresh.copy()
        for r in rect[is_text]:
            x,y,w,h = r
            cv2.rectangle(img_boxes, (x,y), (x+w,y+h), 128, 2)
        conditional_save(img_boxes, get_conditional_path('text_ccs.png', temp_folder))

    # print('before:', is_text.sum())
    rs, cs = recursive_splitting(thresh, rect, is_text, area, t=0.01)
    # print('after:', is_text.sum())

    # remove empty(-ish) regions
    new_rs = [rs[i] for i in range(len(rs)) if np.sum(rs[i] > 0) / (cs[i][2]*cs[i][3]) > 0.01]
    new_cs = [cs[i] for i in range(len(rs)) if np.sum(rs[i] > 0) / (cs[i][2]*cs[i][3]) > 0.01]

    rs, cs = new_rs, new_cs
    if temp_folder:
        img_boxes = thresh.copy()
        for r in cs:
            x,y,w,h = r
            cv2.rectangle(img_boxes, (x,y), (x+w,y+h), 128, 2)
        conditional_save(img_boxes, get_conditional_path('multilevel_regions.png', temp_folder))

    img = thresh.copy() * 0
    for i in range(len(rs)):
        x,y,w,h = cs[i]
        img[y:y+h, x:x+w] = rs[i]
    conditional_save(img, get_conditional_path('multi_level.png', temp_folder))

    # remove the text CCs now empty
    CCt = np.argwhere(is_text).flatten()
    for i in CCt:
        x,y,w,h = rect[i]
        if np.sum(img[y:y+h, x:x+w] > 0) == 0:
            is_text[i] = False
    # print('before:', is_text.sum())
    img = multi_layer(img, rect, is_text, area, t=0.01)
    # print('after:', is_text.sum())
    conditional_save(img, get_conditional_path('multi_layer.png', temp_folder))

    ### Segmentation of homogeneous regions
    rs, cs = recursive_splitting(img, rect, is_text, area, t=0, do_filter=False)
    new_rs = [rs[i] for i in range(len(rs)) if np.sum(rs[i] > 0) / (cs[i][2]*cs[i][3]) > 0.01]
    new_cs = [cs[i] for i in range(len(rs)) if np.sum(rs[i] > 0) / (cs[i][2]*cs[i][3]) > 0.01]
    rs, cs = new_rs, new_cs
    if temp_folder:
        img_boxes = img.copy()
        for r in cs:
            x,y,w,h = r
            cv2.rectangle(img_boxes, (x,y), (x+w,y+h), 128, 2)
        conditional_save(img_boxes, get_conditional_path('mhs_boxes.png', temp_folder))

    conditional_save(img, output_path)
    ...
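A minimal usage sketch for the `segment` entry point above, not part of the original file: the input and output paths are hypothetical, the three-way unpacking of the return value follows the docstring (the actual return statement is truncated in the listing), and the project-local `utils` module providing `conditional_save`/`get_conditional_path` must be importable for `mhs_layout_analisys` to load.

import cv2
from mhs_layout_analisys import segment

# hypothetical input: a grayscale scan of a document page; segment() applies
# Otsu thresholding itself, so no manual binarization is done here
page = cv2.imread('page.png', cv2.IMREAD_GRAYSCALE)

# temp_folder/output_path are optional; when given, the intermediate images
# (heuristic_filter.png, text_ccs.png, ...) and the text-only result are saved
text_img, regions, coords = segment(page, temp_folder='debug', output_path='text_only.png')

# per the docstring, coords holds the (x, y, w, h) box of each detected region
for x, y, w, h in coords:
    print('text region at', (x, y), 'size', (w, h))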
Source:language.py  
1"""2Copyright (c) 2017 Wind River Systems, Inc.3Licensed under the Apache License, Version 2.0 (the "License");4you may not use this file except in compliance with the License.5You may obtain a copy of the License at:6    http://www.apache.org/licenses/LICENSE-2.07Unless required by applicable law or agreed to in writing, software  distributed8under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES9OR CONDITIONS OF ANY KIND, either express or implied.10"""11from abc import ABCMeta12class LanguageType(ABCMeta):13    def __repr__(cls):14        return cls.string_repr15    def __eq__(cls, other):16        return cls.string_repr == other.string_repr17    def __ne__(cls, other):18        return cls.string_repr != other.string_repr19class Language(object):20    """Defines set of supported file languages and their respective file extensions21    """22    class Unknown(metaclass=LanguageType):23        string_repr = "unknown"24        is_text = False25        is_binary = False26        is_source_code = False27        extensions = []28    class Binary(metaclass=LanguageType):29        string_repr = "binary"30        is_text = False31        is_binary = True32        is_source_code = False33        extensions = []34    class PlainText(metaclass=LanguageType):35        string_repr = "all"36        is_text = True37        is_binary = False38        is_source_code = False39        extensions = ["txt", "text", "xml", "html", "xsl", "xspf"]40    class C(metaclass=LanguageType):41        string_repr = "c"42        is_text = True43        is_binary = False44        is_source_code = True45        extensions = ["c", "cc", "cp", "cpp", "c++", "cxx", "h", "hh", "hxx", "hpp", "h++", "moc"]46    class Python(metaclass=LanguageType):47        string_repr = "python"48        is_text = True49        is_binary = False50        is_source_code = True51        extensions = ["py", "rpy", "pyt", "pyw", "pym", "re"]52    class Java(metaclass=LanguageType):53        string_repr = "java"54        is_text = True55        is_binary = False56        is_source_code = True57        extensions = ["java", "jsp", "j"]58    class Shell(metaclass=LanguageType):59        string_repr = "shell"60        is_text = True61        is_binary = False62        is_source_code = True63        extensions = ["sh", "csh", "ksh", "run", "bsh", "bash"]64    class Perl(metaclass=LanguageType):65        string_repr = "perl"66        is_text = True67        is_binary = False68        is_source_code = True69        extensions = ["pl"]70    class Javascript(metaclass=LanguageType):71        string_repr = "javascript"72        is_text = True73        is_binary = False74        is_source_code = True75        extensions = ["js", "javascript", "json"]76    class Scala(metaclass=LanguageType):77        string_repr = "scala"78        is_text = True79        is_binary = False80        is_source_code = True81        extensions = ["scala"]82    class MSDOS(metaclass=LanguageType):83        string_repr = "msdos"84        is_text = True85        is_binary = False86        is_source_code = True87        extensions = ["bat"]88    class Haskell(metaclass=LanguageType):89        string_repr = "haskell"90        is_text = True91        is_binary = False92        is_source_code = True93        extensions = ["hs", "lhs"]94    class PHP(metaclass=LanguageType):95        string_repr = "php"96        is_text = True97        is_binary = False98        is_source_code = True99        extensions = ["php"]100    class Patch(metaclass=LanguageType):101        
string_repr = "patch"102        is_text = True103        is_binary = False104        is_source_code = True105        extensions = ["patch"]106    class Pascal(metaclass=LanguageType):107        string_repr = "pascal"108        is_text = True109        is_binary = False110        is_source_code = True111        extensions = ["p"]112    @staticmethod113    def language_list():114        return [getattr(Language, attr) for attr in Language.__dict__.keys() \115            if type(getattr(Language, attr)) == LanguageType]116    @staticmethod117    def text_languages():118        return [str(lang) for lang in Language.language_list() if lang.is_text]119    @staticmethod120    def guess_language(file_extension):121        for lang in Language.language_list():122            if file_extension in lang.extensions:123                return lang...test_trim.py
Source:test_trim.py  
...
        self.assertEqual('\n', trim.trim(''))
        self.assertEqual('\n', trim.trim('\n'))
    def test_trim_should_leave_leading_whitespace(self):
        self.assertEqual(' abc\n', trim.trim(' abc\n'))
    def test_is_text(self):
        self.assertTrue(trim.is_text(os.path.join(ROOT_DIR, 'README.rst')))
        self.assertTrue(trim.is_text(os.path.join(ROOT_DIR, 'trim')))
        self.assertFalse(trim.is_text(sys.executable))
        self.assertFalse(trim.is_text('/bin/bash'))
        self.assertFalse(trim.is_text('/usr/bin/env'))
        self.assertFalse(trim.is_text('non_existent_file'))
    def test_is_text_should_consider_symlinks_as_non_text(self):
        self.assertFalse(trim.is_text(os.path.join(ROOT_DIR, 'trim.py')))
    def test_is_text_should_consider_whitespace_only_as_text(self):
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w') as temporary_file:
            temporary_file.write('     ')
            temporary_file.flush()
            self.assertTrue(trim.is_text(temporary_file.name))
    def test_is_text_should_consider_empty_files_as_non_text(self):
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w') as temporary_file:
            temporary_file.write('')
            temporary_file.flush()
            self.assertFalse(trim.is_text(temporary_file.name))
    def test_system(self):
        text = 'abc   \n   1234  \n\n  \n'
        import tempfile
        with tempfile.NamedTemporaryFile(delete=False,
                                         mode='w') as temporary_file:
            temporary_file.write(text)
        import subprocess
        process = subprocess.Popen([sys.executable,
                                    os.path.join(ROOT_DIR, 'trim'),
                                    temporary_file.name],
                                   stderr=subprocess.PIPE)
        process.communicate()
        self.assertEqual(0, process.returncode)
        with open(temporary_file.name) as input_file:
...
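Based only on the behaviour exercised by these tests, a small sketch of how the `trim` module's helpers might be called directly; the file paths are illustrative, and the output of `trim.trim` on inputs not covered by the tests is an assumption:

import trim

print(repr(trim.trim('')))                # '\n', per the tests above
print(repr(trim.trim(' abc\n')))          # ' abc\n': leading whitespace is kept
print(trim.is_text('README.rst'))         # True for an existing plain-text file
print(trim.is_text('/bin/bash'))          # False for a binary
print(trim.is_text('non_existent_file'))  # False when the file does not exist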
