Source code for opendlp.regex_generation.generator

from opendlp.regex_generation.config import conf
from opendlp.regex_generation.dataset import Dataset
from opendlp.regex_generation.bpe import learn_bpe
from opendlp.regex_generation.node_factory import NodeFactory
from opendlp.regex_generation.generations import DatasetPopulationGenerator
from opendlp.regex_generation.generations import RandomPopulationGenerator
from opendlp.regex_generation.generations import PopulationInitializer
from opendlp.regex_generation.fitness.objective import Objective
from opendlp.regex_generation.config.evolve_param import EvolutionParam
from opendlp.regex_generation.evolution import Evolution, Selection, Variation
from opendlp.regex_generation.utils import get_fitness_rank, get_best_fitness_precison


[docs]def generate(regex_name, train_data_file, init_population_size=1000, max_iterations=2000, precision_divide_conquer=0.8, iteration_divide_conquer=30, noise_positive_sample_ratio=0.05, population_size_decay_rate=0.95, min_population_size=200): """生成正则表达式 Args: regex_name: 生成的正则表达式的名称 train_data_file: 用于生成正则表达式的训练数据,有两列,列名分布为"positive"和"negative",表示正、负样本 init_population_size: 初始正则表达式种群大小 max_iterations: 最大迭代次数 precision_divide_conquer: 子正则表达式的最小精确率阈值 iteration_divide_conquer: 子正则表达式的最小迭代次数阈值 noise_positive_sample_ratio: 噪声正样本比例,即允许生成的正则表达式无法匹配少量噪声正样本 population_size_decay_rate: 正则表达式种群大小衰减参数 min_population_size: 最小正则表达式种群大小,达到该大小后不再衰减 Returns: 生成的正则表达式结果字典, 形式为{"regex_name": str, "regex_pattern": str}。 """ # 创建数据集 dataset = Dataset(train_data_file) dataset.build() ori_pos_sample_num = len(dataset.pos_examples) # 学习bpe token pair_percent = conf.BPE_PAIR_PERCENT_THRESHOLD char_percent = conf.BPE_CHAR_PERCENT_THRESHOLD bpe_token_dict = learn_bpe(dataset.pos_examples, pair_percent, char_percent, dataset.is_fixed_length) # 构建terminalSet和functionSet node_facto = NodeFactory() node_facto.build() # 默认的terminalSet和functionSet初始化 ### 初始化种群 dataset_popu_gen = DatasetPopulationGenerator(dataset, bpe_token_dict) random_popu_gen = RandomPopulationGenerator(conf.MAX_DEPTH, node_facto) popu_initializer = PopulationInitializer(dataset_popu_gen, random_popu_gen) population = popu_initializer.initlize(init_population_size) # 演化 objective = Objective(dataset) result_fitness = [] evolve_param = EvolutionParam(population_size_decay_rate, min_population_size) selection = Selection() variation = Variation(evolve_param, bpe_token_dict, random_popu_gen) evolution = Evolution(evolve_param, selection, variation, random_popu_gen) iter_best = 0 for g in range(max_iterations): fitness_ranked = get_fitness_rank(population, objective) best_fitness = get_best_fitness_precison(fitness_ranked) print('{}\t generation: {}\t best: {}\t '.format(regex_name, g, best_fitness.tree.form(''))) best_precision = best_fitness.fitness_arr[0] iter_best += 1 if best_precision >= precision_divide_conquer \ and iter_best >= iteration_divide_conquer: result_fitness.append(best_fitness) dataset.remove_by_regex(best_fitness.tree) if len(dataset.pos_examples) < ori_pos_sample_num * noise_positive_sample_ratio: break population = popu_initializer.initlize(len(population)) iter_best = 0 else: population = evolution.evolve(len(population), fitness_ranked) # 输出结果 result_regexes = [] for i, fitness in enumerate(result_fitness): sub_regex_string = fitness.tree.form('', conf.RegexFlavour.Python) print('sub_regex_{}: {}'.format(i, sub_regex_string)) result_regexes.append(sub_regex_string) regex_string = '|'.join(result_regexes) print('{}: {}'.format(regex_name, regex_string)) result = {'regex_name': regex_name, 'regex_pattern': regex_string} return result
if __name__ == '__main__': import os import pandas as pd data_dir = '../../tests/data/regex_generation/test-data/' regex_names = ['ID_CARD', 'TELEPHONE', 'MOBILE_PHONE', 'EMAIL', 'LICENSE_PLATE', 'BANK_CARD', 'PASSPORT', 'SOCIAL_CREDIT_CODE', 'IPV4', 'IPV6', 'MAC', 'DOMAIN_NAME', 'POSTCODE', 'DATE'] result_dict = {'regex_name': [], 'regex_pattern':[]} for regex_name in regex_names: train_data_file = os.path.join(data_dir, regex_name + '.csv') result = generate(regex_name, train_data_file) print('result: ', result) print('\n') result_dict['regex_name'].append(result['regex_name']) result_dict['regex_pattern'].append(result['regex_pattern']) df = pd.DataFrame(result_dict) df.to_csv('result.csv', index=False)