نرم‌افزار پارسیک‌نمایِ پی‌نویس - PN²

**Mehrbod** · 05-03-2013, 02:18 PM

نرم‌افزار PârsikNemâye PeyNevis یا PN² یک برنامه برای شناسایی خودکار واژگان پارسیک و برچسب زدن آنها میباشد.

نرم‌افزار به زبان Python نوشته شده و از این فندآوری‌ها^[1] میبهرد^[2]:

١. Python Programming Language Official Website
٢. MongoDB

از آنجاییکه port کردن برنامه برای همگان کارِ دشوار و زمانبری است, چندتایی از راهکارهایی که پیشتر در آورده‌ام را اینجا میگذارم, شاید به کار دیگران بیایند.

unipers/mapped_chars.ini

کد:

آ	â
ا	a
ب	b
پ	p
ت	t
ث	s
ج	j
چ	c
ح	h
خ	x
د	d
ذ	z
ر	r
ز	z
ژ	ž
س	s
ش	š
ص	s
ض	z
ط	t
ظ	z
ع	å
غ	q
ف	f
ق	q
ک	k
گ	g
ل	l
م	m
ن	n
#و	ů
و	v
ه	h
ی	y
ء	$
آ	â
اً	$
هٔ	$
ة	$
ٸ	'

ترانویس‌ها (هیچکدام ١٠٠% درست کار نمیکند)

کد:

# Reverse lookup table: Latin (UniPers) character -> Perso-Arabic character,
# built from the tab-separated rows of unipers/mapped_chars.ini
# (column 0 = Perso-Arabic, column 1 = Latin).
_unimapped = {}
with open('unipers/mapped_chars.ini', 'r', encoding='utf-8') as file:
    lines = file.read().split('\n')
# Order the rows by their Perso-Arabic column: when several Perso-Arabic
# letters share one Latin letter, the one that sorts first is the one kept.
lines.sort(key=lambda l: l.split('\t')[0])
for line in lines:
    # Rows without a tab separator and '#'-prefixed rows carry no mapping.
    if '\t' not in line or line.startswith('#'):
        continue
    b, a = line.split('\t')[:2]
    # Keep the first mapping seen for each Latin character.
    _unimapped.setdefault(a, b)
# Hard-wired entries that override or complete whatever the file provided.
_unimapped.update({'i': 'ی', 'u': 'و', 'v': 'و', 'w': 'و', ' ': ' '})
# Every ASCII punctuation character maps to itself.
_unimapped.update(zip(string.punctuation, string.punctuation))

def stripvowels(word):
  """Return `word` with every run of the five diacritics in the pattern removed."""
  diacritic_runs = re.compile('(ِ|ُ|َ|ْ|ّ)+')
  return diacritic_runs.sub('', word)

def unipers2roman(word, reverse=False):
  """Convert between UniPers letters and their ASCII digraph spellings.

  Forward (default): x -> kh, ž -> zh, š -> sh, c -> ch.
  With ``reverse=True`` the digraphs are turned back into the single letters.
  The replacements are applied one after another in the order listed.

  Args:
    word: the string to convert.
    reverse: when true, map digraphs back to UniPers letters.

  Returns:
    The converted string.
  """
  # A constant tuple of pairs replaces the per-call bidict: the table is
  # fixed, only ever walked in order, and needs no third-party container.
  pairs = (('x', 'kh'), ('ž', 'zh'), ('š', 'sh'), ('c', 'ch'))
  for unipers, roman in pairs:
    if reverse:
      word = word.replace(roman, unipers)
    else:
      word = word.replace(unipers, roman)
  return word

def unipers2perso(word, vowels=False, hint=None):
  """Transliterate a Latin-script (UniPers) word into Perso-Arabic script.

  The conversion is a chain of spelling heuristics; the surrounding post
  itself notes that none of these transliterators is fully accurate.

  Args:
    word: the UniPers spelling to convert.
    vowels: when true, the diacritic from `_vowels` is appended for every
      non-initial e/o/a.
    hint: optional string searched for the candidate letters of the
      ambiguous Latin letters t, z, s and h; without it the first
      candidate in `mfaces` is used.

  Returns:
    The transliterated string, or '' for an empty `word`.

  Characters that reach the generic lookup without an entry in the
  module-level `_unimapped` table raise KeyError.
  """
  if len(word) == 0: return ''
  ret = ''
  # Diacritics appended for non-initial short vowels when `vowels` is set.
  _vowels = {'e':'ِ', 'o':'ُ', 'a':'َ'}
  # Latin letters that stand for more than one Perso-Arabic letter, and the
  # candidates for each; the first candidate is the default.
  mfaced = 't z s h'.split()
  mfaces = {
    't':'ت ط'.split(),
    'z':'ز ذ ض ظ'.split(),
    's':'س ص ث'.split(),
    'h':'ه ح'.split(),
    }
  idx = 0
  # NOTE: iteration is over the lower-cased word, while the neighbour
  # look-ups below index the original `word`.
  for idx, c in enumerate(word.lower()):
    # An 'i' directly followed by 'y' is dropped; the 'y' is emitted alone.
    if idx < len(word) -1 and c == 'i' and word[idx+1] == 'y':
      continue
    # Word-initial vowels.
    if idx == 0 and c in ['i']: ret += 'ای'
    elif idx == 0 and c in ['u']: ret += 'او'
    elif idx == 0 and c in ['e', 'o', 'a']: ret += 'ا'
    # Non-initial e/o/a emit nothing unless one of the special cases applies.
    elif c in ['e', 'o', 'a'] and idx > 0:
      # 'a' preceded by the three letters 'pas', or directly by 'i'.
      if c in ['a'] and (word[idx-3:idx] in ['pas'] or word[idx-1] in ['i']): ret += 'ا'
      # 'o' at the start of these patterns emits nothing.
      elif c in ['o'] and re.match('o(.udan|dâ)', word[idx:]): pass
      elif c in ['o'] and re.match('o([z])?ir', word[idx:]): pass
      # 'o' before one of the listed letters (not in the last two positions).
      elif c in ['o'] and idx < len(word)-2 and word[idx+1] in ['â','g','k','l','d', 'b', 'z']: ret += 'و'
      if vowels: ret += _vowels[c]
    else:
      if c in mfaced:
        if hint and idx < len(hint):
          l = mfaces[c]
          # Position in `hint` (searching from `idx`) of each candidate that
          # occurs there, earliest first.
          indices = sorted(filter(lambda x: x[0] > -1, ((hint.find(ch, idx), ch,) for ch in l)), key=lambda x: x[0])
          if indices:
            # Overwrites the loop index for the rest of this iteration only;
            # enumerate() rebinds it on the next pass.
            idx = indices[0][0]
            ret += indices[0][1] if idx < len(hint) else l[0]
          else:
            ret += l[0]
        else:
          ret += mfaces[c][0]
      # NOTE(review): appears unreachable, 'e' is consumed by the vowel
      # branches above.
      elif c in ['e']: ret += 'ع'
      else: ret += _unimapped[c]
    # Word-final 'a' or 'e' gets a trailing 'ه'.
    if idx == len(word)-1 and c in ['a', 'e']: ret += 'ه'
    elif c in ['â']:
      # 'â' followed by 'i' (not in the last two positions) gets an extra 'ی'.
      if idx < len(word)-2 and word[idx+1] == 'i': ret += 'ی'
    # 'e' whose remainder does not contain 1-3 characters followed by one of
    # the listed endings.
    elif c in ['e'] and not re.search('(.{1,3})(udan|â|uxtan|uz|stan)', word[idx:]):
      # NOTE(review): appears unreachable, a final 'e' is handled by the
      # first branch of this chain.
      if idx == len(word)-1: ret += 'ه'
      elif idx < len(word) -1:
        # 'ه' plus a zero-width non-joiner before a/â/s/p; before 'i' an
        # 'ا' is added as well.
        if word[idx+1] in ['a', 'â', 's', 'p']: ret += 'ه' + '\u200c'
        elif word[idx+1] in ['i']: ret += 'ه' + '\u200cا'
  # 'آ' is kept only as the first character; elsewhere it becomes 'ا'.
  ret = ret[0] + ret[1:].replace('آ','ا')
  # Collapse runs of a repeated character to one, except for ی, ن and و.
  for c in ret:
    if c in ['ی','ن','و']: continue
    ret = re.sub('%s+' % re.escape(c),c, ret)
  return ret

گرفتن بن کنون از کارواژه^[3]:

augment_verb_exceptions.ini:

کد:

#Exception_Forms
zistan ziv
bihudan bihun

کد:

# Verb -> stem overrides read from augment_verb_exceptions.ini: the first
# line of the file is skipped, empty lines are ignored, and every remaining
# line is split on whitespace into a (key, value) pair.
v_exceptions = dict(
  entry.split()
  for entry in fread('augment_verb_exceptions.ini').splitlines()[1:]
  if entry
)

def konundis(verb):
  """Derive the present stem of a verb ("Âhanješe bone konun az kârvâže").

  Resolution order:
    1. the web lookup `jahanshiri`, first with prefix + verb, then with the
       bare verb;
    2. the `v_exceptions` override table;
    3. the ordered suffix rules below (first matching rule wins).

  Args:
    verb: a UniPers infinitive string, or a `Word`. For a `Word` with
      `roots`, the verb part is taken from `roots[roots_into_verb:]` when
      `roots_into_verb` is set; otherwise the last root is the verb and the
      preceding roots form a prefix that is re-attached to the result. A
      `Word` without roots contributes its `text`.

  Returns:
    The present stem, including the prefix where one was split off.

  Raises:
    AssertionError: if no source yields a non-empty stem.
  """
  prefix = ""
  if isinstance(verb, Word):
    if verb.roots:
      if verb.roots_into_verb:
        verb = ''.join(verb.roots[verb.roots_into_verb:]).lower()
      else:
        prefix = ''.join(verb.roots[:-1]).lower()
        verb = verb.roots[-1].lower()
      # A bare infinitive ending carries no stem of its own.
      if verb in ('idan', 'dan'):
        return prefix
    else:
      verb = verb.text

  # Web lookup: the full (prefixed) form already contains the prefix.
  ret = jahanshiri((prefix + verb).lower())
  if ret:
    return ret
  # Retry with the bare verb only when that is a different query; without
  # a prefix it would just repeat the identical request.
  if prefix:
    ret = jahanshiri(verb.lower())
    if ret:
      return prefix + ret

  if verb in v_exceptions:
    return prefix + v_exceptions[verb]

  # (pattern, format) rules; the captured leading part of the verb is
  # substituted into the format. Order matters: first match wins.
  mapped = (
    ('(.*?)oftan', '%sub'),
    ('(.*?)aftan', '%sâv'), ('(.*?)âftan', '%sâb'), ('(.*?)eftan', '%sev'), ('(.*?)ftan', '%sb'),
    ('(.*?)eštan', '%sis'),
    ('(.*?)aštan', '%sard'),
    ('(.*?)(štan|štân)', '%sr'),
    ('(.{1,3})astan', '%san'),
    ('(.*?)xtan', '%sz'),
    ('(.*?)udan', '%sâ'),
    ('(.*?)ostan', '%su'),
    ('(.{1,2})idan', '%sin'),
    ('(.{1,2})stan', '%ss'),
    ('(.*?)(adan|yidan|idan|âdan|dan|estan|stan|tan)', '%s'),
  )

  ret = None
  for dis, fmt in mapped:
    found = re.match(dis, verb)
    if found:
      ret = fmt % found.group(1)
      break

  # Raised explicitly (instead of `assert`) so the check survives `python -O`;
  # the exception type seen by callers is unchanged.
  if not ret:
    raise AssertionError("Kârvâžeye nâšenâxte: '%s'" % verb)
  return prefix + ret

def gozaštedis(verb):
  """Return the past stem of a verb ("Âhanješe bone gozašte az kârvâže").

  The stem is `verb` without its last two characters (presumably the
  infinitive ending "-an").
  """
  ending_length = 2
  return verb[:-ending_length]
  
def jahanshiri(verb):
  """Fetch the present stem of `verb` from the jahanshiri.ir conjugator.

  The verb is converted to its ASCII-digraph spelling with `unipers2roman`
  (with 'â' percent-encoded by hand) and the page is downloaded through
  `tor.download`, trying up to five times.

  Args:
    verb: UniPers infinitive.

  Returns:
    The stem converted back to UniPers, or '' when the page has no
    'present stem' section or the expected table cell is not found.

  Raises:
    AssertionError: if no attempt produced a non-empty page.
  """
  ret = ''
  url = 'http://www.jahanshiri.ir/pvc/conjpl.php?verb=%s&lang=en'
  query = url % unipers2roman(verb).replace('â', '%C3%A2')

  # `page` is initialised so that a total download failure reaches the
  # explicit check below instead of raising UnboundLocalError.
  page = None
  last_error = None
  for _ in range(5):
    try:
      page = tor.download(query)[0]
      break
    except Exception as ex:  # downloader error types are not visible here
      last_error = ex
  # Raised explicitly (instead of `assert`) so the check survives `python -O`.
  if not page:
    raise AssertionError("Download failed for '%s'" % verb) from last_error

  if 'present stem' not in page:
    return ret
  # NOTE(review): the previous pattern spelled the tags with square brackets
  # ("[td]", "[/td]"), which a regex reads as one-character classes rather
  # than HTML tags; literal tags are used here. Confirm against a live page.
  found = re.search('</td></tr><tr>(?:<td>.*?</td>){2}<td>(.*?)</td>', page)
  if found is None:
    return ret
  return unipers2roman(found.group(1), reverse=True)

----
1. ^ fand+âvar+i::Fandâvari || فنداوری: تکنولوژی Ϣiki-En technology
2. ^ Bahridan || بهریدن: بهره جستن; استفاده کردن Ϣiki-En to utilize; to use
3. ^ kâr+vâže::Kârvâže || کارواژه: فعل Dehxodâ verb

این تالار تنها ازبرای بایگانی نگهداری میشود; برای دسترسی به سخنگاه دفترچه اینجا کلیک کنید.

User Tag List

جُستار: نرم‌افزار پارسیک‌نمایِ پی‌نویس - PN²

ابزارهای جُستار

جستجو جُستار

شیوه‌ی نمایش جُستار

Threaded View

نرم‌افزار پارسیک‌نمایِ پی‌نویس - PN²

4 کاربر برای این پست سودمند از Mehrbod گرامی سپاسگزاری کرده اند:

داده‌های جُستار

کاربرانی که سرگرم دیدن این جُستار هستند

جُستارهای همانند

فاطی حقیقت جو: در یک انتخابات آزاد بین میرحسین موسوی و رضاپهلوی، قطعا میرحسین موسوی رئیس جمهور میشود

تقاضای زیرنویس کردن مجموعه "نوارهای بی‌خدایی"

کدام سرویس وبلاگدهی امن تر است.

کلیدواژگان این جُستار

مجوز های پیک و ویرایش